From 191f90cbea08122b48107f1f9116106fbf3bdfac Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 14 May 2020 11:14:39 -0400 Subject: [PATCH 01/58] msix: allow qword MSI-X table accesses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI spec says: For all accesses to MSI-X Table and MSI-X PBA fields, software must use aligned full DWORD or aligned full QWORD transactions; otherwise, the result is undefined. However, since MSI-X was converted to use memory API, QEMU started blocking qword transactions, only allowing DWORD ones. Guests do not seem to use QWORD accesses, but let's be spec compliant. Fixes: 95524ae8dc8f ("msix: convert to memory API") Signed-off-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé --- hw/pci/msix.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/pci/msix.c b/hw/pci/msix.c index 29187898f2..e6a5559038 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -199,6 +199,9 @@ static const MemoryRegionOps msix_table_mmio_ops = { .endianness = DEVICE_LITTLE_ENDIAN, .valid = { .min_access_size = 4, + .max_access_size = 8, + }, + .impl = { .max_access_size = 4, }, }; @@ -227,6 +230,9 @@ static const MemoryRegionOps msix_pba_mmio_ops = { .endianness = DEVICE_LITTLE_ENDIAN, .valid = { .min_access_size = 4, + .max_access_size = 8, + }, + .impl = { .max_access_size = 4, }, }; From acc5c98ddd8f85d41fd22e57cf289aeaa34f160a Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 5 Jun 2020 18:09:09 -0600 Subject: [PATCH 02/58] diffs-allowed: add the SRAT AML to diffs-allowed In anticipation of a change to the SRAT generation in qemu, add the AML file to diffs-allowed. Signed-off-by: Vishal Verma Message-Id: <20200606000911.9896-2-vishal.l.verma@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/qtest/bios-tables-test-allowed-diff.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..e8f2766a63 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,4 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/pc/SRAT.dimmpxm", +"tests/data/acpi/q35/SRAT.dimmpxm", +"tests/data/acpi/virt/SRAT.memhp", From c3b0cf6e7dbc48ecf84ace4224f220fa9baa85b6 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 5 Jun 2020 18:09:10 -0600 Subject: [PATCH 03/58] hw/acpi/nvdimm: add a helper to augment SRAT generation NVDIMMs can belong to their own proximity domains, as described by the NFIT. In such cases, the SRAT needs to have Memory Affinity structures in the SRAT for these NVDIMMs, otherwise Linux doesn't populate node data structures properly during NUMA initialization. See the following for an example failure case. https://lore.kernel.org/linux-nvdimm/20200416225438.15208-1-vishal.l.verma@intel.com/ Introduce a new helper, nvdimm_build_srat(), and call it for both the i386 and arm versions of 'build_srat()' to augment the SRAT with memory affinity information for NVDIMMs. The relevant command line options to exercise this are below. Nodes 0-1 contain CPUs and regular memory, and nodes 2-3 are the NVDIMM address space. -object memory-backend-ram,id=mem0,size=2048M -numa node,nodeid=0,memdev=mem0, -numa cpu,node-id=0,socket-id=0 -object memory-backend-ram,id=mem1,size=2048M -numa node,nodeid=1,memdev=mem1, -numa cpu,node-id=1,socket-id=1 -numa node,nodeid=2, -object memory-backend-file,id=nvmem0,share,mem-path=nvdimm-0,size=16384M,align=1G -device nvdimm,memdev=nvmem0,id=nv0,label-size=2M,node=2 -numa node,nodeid=3, -object memory-backend-file,id=nvmem1,share,mem-path=nvdimm-1,size=16384M,align=1G -device nvdimm,memdev=nvmem1,id=nv1,label-size=2M,node=3 Cc: Jingqi Liu Cc: Michael S. Tsirkin Reviewed-by: Jingqi Liu Reviewed-by: Igor Mammedov Signed-off-by: Vishal Verma Message-Id: <20200606000911.9896-3-vishal.l.verma@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/nvdimm.c | 23 +++++++++++++++++++++++ hw/arm/virt-acpi-build.c | 4 ++++ hw/i386/acpi-build.c | 5 +++++ include/hw/mem/nvdimm.h | 1 + 4 files changed, 33 insertions(+) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 9316d12b70..8f7cc16add 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -28,6 +28,7 @@ #include "qemu/osdep.h" #include "qemu/uuid.h" +#include "qapi/error.h" #include "hw/acpi/acpi.h" #include "hw/acpi/aml-build.h" #include "hw/acpi/bios-linker-loader.h" @@ -1334,6 +1335,28 @@ static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data, free_aml_allocator(); } +void nvdimm_build_srat(GArray *table_data) +{ + GSList *device_list = nvdimm_get_device_list(); + + for (; device_list; device_list = device_list->next) { + AcpiSratMemoryAffinity *numamem = NULL; + DeviceState *dev = device_list->data; + Object *obj = OBJECT(dev); + uint64_t addr, size; + int node; + + node = object_property_get_int(obj, PC_DIMM_NODE_PROP, &error_abort); + addr = object_property_get_uint(obj, PC_DIMM_ADDR_PROP, &error_abort); + size = object_property_get_uint(obj, PC_DIMM_SIZE_PROP, &error_abort); + + numamem = acpi_data_push(table_data, sizeof *numamem); + build_srat_memory(numamem, addr, size, node, + MEM_AFFINITY_ENABLED | MEM_AFFINITY_NON_VOLATILE); + } + g_slist_free(device_list); +} + void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data, BIOSLinker *linker, NVDIMMState *state, uint32_t ram_slots) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 1b0a584c7b..2cbccd5fe2 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -539,6 +539,10 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } } + if (ms->nvdimms_state->is_enabled) { + nvdimm_build_srat(table_data); + } + if (ms->device_memory) { numamem = acpi_data_push(table_data, sizeof *numamem); build_srat_memory(numamem, ms->device_memory->base, diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 2e15f6848e..d996525e2c 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2428,6 +2428,11 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) MEM_AFFINITY_ENABLED); } } + + if (machine->nvdimms_state->is_enabled) { + nvdimm_build_srat(table_data); + } + slots = (table_data->len - numa_start) / sizeof *numamem; for (; slots < pcms->numa_nodes + 2; slots++) { numamem = acpi_data_push(table_data, sizeof *numamem); diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index a3c08955e8..b67a1aedf6 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -155,6 +155,7 @@ typedef struct NVDIMMState NVDIMMState; void nvdimm_init_acpi_state(NVDIMMState *state, MemoryRegion *io, struct AcpiGenericAddress dsm_io, FWCfgState *fw_cfg, Object *owner); +void nvdimm_build_srat(GArray *table_data); void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data, BIOSLinker *linker, NVDIMMState *state, uint32_t ram_slots); From 8a49b30034da8c5440f425a94e7fb12425f35283 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 5 Jun 2020 18:09:11 -0600 Subject: [PATCH 04/58] tests/acpi: update expected SRAT files Update expected SRAT files for the change to account for NVDIMM NUMA nodes in the SRAT. AML diffs: tests/data/acpi/pc/SRAT.dimmpxm: Message-Id: <20200606000911.9896-4-vishal.l.verma@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/data/acpi/pc/SRAT.dimmpxm | Bin 392 -> 392 bytes tests/data/acpi/q35/SRAT.dimmpxm | Bin 392 -> 392 bytes tests/data/acpi/virt/SRAT.memhp | Bin 186 -> 226 bytes tests/qtest/bios-tables-test-allowed-diff.h | 3 --- 4 files changed, 3 deletions(-) diff --git a/tests/data/acpi/pc/SRAT.dimmpxm b/tests/data/acpi/pc/SRAT.dimmpxm index f5c0267ea24bb404b6b4e687390140378fbdc3f1..5a13c61b9041c6045c29643bf93a111fb1c0c76a 100644 GIT binary patch delta 51 scmeBR?qKE$4ss0XU}Rum%-G0fz$nec00kUCF%aN@Pz(&LlS3Je0lmQmhyVZp delta 51 icmeBR?qKE$4ss0XU}RumY}m+Uz$ndt8%z#mGzI{_tp$hx diff --git a/tests/data/acpi/q35/SRAT.dimmpxm b/tests/data/acpi/q35/SRAT.dimmpxm index f5c0267ea24bb404b6b4e687390140378fbdc3f1..5a13c61b9041c6045c29643bf93a111fb1c0c76a 100644 GIT binary patch delta 51 scmeBR?qKE$4ss0XU}Rum%-G0fz$nec00kUCF%aN@Pz(&LlS3Je0lmQmhyVZp delta 51 icmeBR?qKE$4ss0XU}RumY}m+Uz$ndt8%z#mGzI{_tp$hx diff --git a/tests/data/acpi/virt/SRAT.memhp b/tests/data/acpi/virt/SRAT.memhp index 1b57db2072e7f7e2085c4a427aa31c7383851b71..9a35adb40c6f7cd822e5af37abba8aad033617cb 100644 GIT binary patch delta 43 rcmdnR_=u4!ILI;N5d#AQbIe4p$wD1K76@=aC Date: Fri, 15 May 2020 17:04:06 +0200 Subject: [PATCH 05/58] qtest: allow DSDT acpi table changes Signed-off-by: Gerd Hoffmann Message-Id: <20200515150421.25479-2-kraxel@redhat.com> --- tests/qtest/bios-tables-test-allowed-diff.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..6a052c5044 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,18 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/pc/DSDT", +"tests/data/acpi/pc/DSDT.acpihmat", +"tests/data/acpi/pc/DSDT.bridge", +"tests/data/acpi/pc/DSDT.cphp", +"tests/data/acpi/pc/DSDT.dimmpxm", +"tests/data/acpi/pc/DSDT.ipmikcs", +"tests/data/acpi/pc/DSDT.memhp", +"tests/data/acpi/pc/DSDT.numamem", +"tests/data/acpi/q35/DSDT", +"tests/data/acpi/q35/DSDT.acpihmat", +"tests/data/acpi/q35/DSDT.bridge", +"tests/data/acpi/q35/DSDT.cphp", +"tests/data/acpi/q35/DSDT.dimmpxm", +"tests/data/acpi/q35/DSDT.ipmibt", +"tests/data/acpi/q35/DSDT.memhp", +"tests/data/acpi/q35/DSDT.mmio64", +"tests/data/acpi/q35/DSDT.numamem", From df9b9b42cd2375a4eb785702899699a020ca2597 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 15 May 2020 17:04:07 +0200 Subject: [PATCH 06/58] acpi: move aml builder code for rtc device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gerd Hoffmann Reviewed-by: Igor Mammedov Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20200515150421.25479-3-kraxel@redhat.com> --- hw/i386/acpi-build.c | 17 ----------------- hw/rtc/mc146818rtc.c | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index d996525e2c..df5417c75f 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1137,22 +1137,6 @@ static Aml *build_fdc_device_aml(ISADevice *fdc) return dev; } -static Aml *build_rtc_device_aml(void) -{ - Aml *dev; - Aml *crs; - - dev = aml_device("RTC"); - aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0B00"))); - crs = aml_resource_template(); - aml_append(crs, aml_io(AML_DECODE16, 0x0070, 0x0070, 0x10, 0x02)); - aml_append(crs, aml_irq_no_flags(8)); - aml_append(crs, aml_io(AML_DECODE16, 0x0072, 0x0072, 0x02, 0x06)); - aml_append(dev, aml_name_decl("_CRS", crs)); - - return dev; -} - static Aml *build_kbd_device_aml(void) { Aml *dev; @@ -1278,7 +1262,6 @@ static void build_isa_devices_aml(Aml *table) Aml *scope = aml_scope("_SB.PCI0.ISA"); Object *obj = object_resolve_path_type("", TYPE_ISA_BUS, &ambiguous); - aml_append(scope, build_rtc_device_aml()); aml_append(scope, build_kbd_device_aml()); aml_append(scope, build_mouse_device_aml()); if (fdc) { diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c index 9c30cbdcd7..fe05a4488e 100644 --- a/hw/rtc/mc146818rtc.c +++ b/hw/rtc/mc146818rtc.c @@ -27,6 +27,7 @@ #include "qemu/cutils.h" #include "qemu/module.h" #include "qemu/bcd.h" +#include "hw/acpi/aml-build.h" #include "hw/irq.h" #include "hw/qdev-properties.h" #include "qemu/timer.h" @@ -1007,13 +1008,34 @@ static void rtc_resetdev(DeviceState *d) } } +static void rtc_build_aml(ISADevice *isadev, Aml *scope) +{ + Aml *dev; + Aml *crs; + + crs = aml_resource_template(); + aml_append(crs, aml_io(AML_DECODE16, RTC_ISA_BASE, RTC_ISA_BASE, + 0x10, 0x02)); + aml_append(crs, aml_irq_no_flags(RTC_ISA_IRQ)); + aml_append(crs, aml_io(AML_DECODE16, RTC_ISA_BASE + 2, RTC_ISA_BASE + 2, + 0x02, 0x06)); + + dev = aml_device("RTC"); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0B00"))); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(scope, dev); +} + static void rtc_class_initfn(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + ISADeviceClass *isa = ISA_DEVICE_CLASS(klass); dc->realize = rtc_realizefn; dc->reset = rtc_resetdev; dc->vmsd = &vmstate_rtc; + isa->build_aml = rtc_build_aml; device_class_set_props(dc, mc146818rtc_properties); } From f592b94f3cbd5325bcf5142509b60cb6ab252770 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 15 May 2020 17:04:08 +0200 Subject: [PATCH 07/58] acpi: rtc: use a single crs range Use a single io range for _CRS instead of two, following what real hardware does. Signed-off-by: Gerd Hoffmann Reviewed-by: Igor Mammedov Message-Id: <20200515150421.25479-4-kraxel@redhat.com> --- hw/rtc/mc146818rtc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c index fe05a4488e..1e9fa0f33f 100644 --- a/hw/rtc/mc146818rtc.c +++ b/hw/rtc/mc146818rtc.c @@ -1013,12 +1013,14 @@ static void rtc_build_aml(ISADevice *isadev, Aml *scope) Aml *dev; Aml *crs; + /* + * Reserving 8 io ports here, following what physical hardware + * does, even though qemu only responds to the first two ports. + */ crs = aml_resource_template(); aml_append(crs, aml_io(AML_DECODE16, RTC_ISA_BASE, RTC_ISA_BASE, - 0x10, 0x02)); + 0x01, 0x08)); aml_append(crs, aml_irq_no_flags(RTC_ISA_IRQ)); - aml_append(crs, aml_io(AML_DECODE16, RTC_ISA_BASE + 2, RTC_ISA_BASE + 2, - 0x02, 0x06)); dev = aml_device("RTC"); aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0B00"))); From 4b8e369b91deff3ae7d8ae1067d336a66464f472 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 15 May 2020 17:04:09 +0200 Subject: [PATCH 08/58] acpi: serial: don't use _STA method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The _STA method dates back to the days where we had a static DSDT. The device is listed in the DSDT table unconditionally and the _STA method checks a bit in the isa bridge pci config space to figure whenever a given is isa device is present or not, then evaluates to 0x0f (present) or 0x00 (absent). These days the DSDT is generated by qemu anyway, so if a device is not present we can simply drop it from the DSDT instead. Signed-off-by: Gerd Hoffmann Reviewed-by: Igor Mammedov Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20200515150421.25479-5-kraxel@redhat.com> --- hw/i386/acpi-build.c | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index df5417c75f..cb22cb0fe6 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1208,50 +1208,34 @@ static Aml *build_lpt_device_aml(void) return dev; } -static Aml *build_com_device_aml(uint8_t uid) +static void build_com_device_aml(Aml *scope, uint8_t uid) { Aml *dev; Aml *crs; - Aml *method; - Aml *if_ctx; - Aml *else_ctx; - Aml *zero = aml_int(0); - Aml *is_present = aml_local(0); - const char *enabled_field = "CAEN"; uint8_t irq = 4; uint16_t io_port = 0x03F8; assert(uid == 1 || uid == 2); if (uid == 2) { - enabled_field = "CBEN"; irq = 3; io_port = 0x02F8; } + if (!memory_region_present(get_system_io(), io_port)) { + return; + } dev = aml_device("COM%d", uid); aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0501"))); aml_append(dev, aml_name_decl("_UID", aml_int(uid))); - method = aml_method("_STA", 0, AML_NOTSERIALIZED); - aml_append(method, aml_store(aml_name("%s", enabled_field), is_present)); - if_ctx = aml_if(aml_equal(is_present, zero)); - { - aml_append(if_ctx, aml_return(aml_int(0x00))); - } - aml_append(method, if_ctx); - else_ctx = aml_else(); - { - aml_append(else_ctx, aml_return(aml_int(0x0f))); - } - aml_append(method, else_ctx); - aml_append(dev, method); + aml_append(dev, aml_name_decl("_STA", aml_int(0xf))); crs = aml_resource_template(); aml_append(crs, aml_io(AML_DECODE16, io_port, io_port, 0x00, 0x08)); aml_append(crs, aml_irq_no_flags(irq)); aml_append(dev, aml_name_decl("_CRS", crs)); - return dev; + aml_append(scope, dev); } static void build_isa_devices_aml(Aml *table) @@ -1268,8 +1252,8 @@ static void build_isa_devices_aml(Aml *table) aml_append(scope, build_fdc_device_aml(fdc)); } aml_append(scope, build_lpt_device_aml()); - aml_append(scope, build_com_device_aml(1)); - aml_append(scope, build_com_device_aml(2)); + build_com_device_aml(scope, 1); + build_com_device_aml(scope, 2); if (ambiguous) { error_report("Multiple ISA busses, unable to define IPMI ACPI data"); From dcdbfaafe90a5e6e172368b2aa5500a9ca192e49 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 15 May 2020 17:04:10 +0200 Subject: [PATCH 09/58] acpi: move aml builder code for serial device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code uses the isa_serial_io array to figure what the device uid is. Side effect is that acpi antries are not limited to port 1+2 any more, we'll also get entries for ports 3+4. Signed-off-by: Gerd Hoffmann Reviewed-by: Igor Mammedov Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20200515150421.25479-6-kraxel@redhat.com> --- hw/char/serial-isa.c | 22 ++++++++++++++++++++++ hw/i386/acpi-build.c | 32 -------------------------------- 2 files changed, 22 insertions(+), 32 deletions(-) diff --git a/hw/char/serial-isa.c b/hw/char/serial-isa.c index f9b6eed783..165e320e65 100644 --- a/hw/char/serial-isa.c +++ b/hw/char/serial-isa.c @@ -27,6 +27,7 @@ #include "qapi/error.h" #include "qemu/module.h" #include "sysemu/sysemu.h" +#include "hw/acpi/aml-build.h" #include "hw/char/serial.h" #include "hw/isa/isa.h" #include "hw/qdev-properties.h" @@ -81,6 +82,25 @@ static void serial_isa_realizefn(DeviceState *dev, Error **errp) isa_register_ioport(isadev, &s->io, isa->iobase); } +static void serial_isa_build_aml(ISADevice *isadev, Aml *scope) +{ + ISASerialState *isa = ISA_SERIAL(isadev); + Aml *dev; + Aml *crs; + + crs = aml_resource_template(); + aml_append(crs, aml_io(AML_DECODE16, isa->iobase, isa->iobase, 0x00, 0x08)); + aml_append(crs, aml_irq_no_flags(isa->isairq)); + + dev = aml_device("COM%d", isa->index + 1); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0501"))); + aml_append(dev, aml_name_decl("_UID", aml_int(isa->index + 1))); + aml_append(dev, aml_name_decl("_STA", aml_int(0xf))); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(scope, dev); +} + static const VMStateDescription vmstate_isa_serial = { .name = "serial", .version_id = 3, @@ -103,9 +123,11 @@ static Property serial_isa_properties[] = { static void serial_isa_class_initfn(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + ISADeviceClass *isa = ISA_DEVICE_CLASS(klass); dc->realize = serial_isa_realizefn; dc->vmsd = &vmstate_isa_serial; + isa->build_aml = serial_isa_build_aml; device_class_set_props(dc, serial_isa_properties); set_bit(DEVICE_CATEGORY_INPUT, dc->categories); } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index cb22cb0fe6..6de25f6484 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1208,36 +1208,6 @@ static Aml *build_lpt_device_aml(void) return dev; } -static void build_com_device_aml(Aml *scope, uint8_t uid) -{ - Aml *dev; - Aml *crs; - uint8_t irq = 4; - uint16_t io_port = 0x03F8; - - assert(uid == 1 || uid == 2); - if (uid == 2) { - irq = 3; - io_port = 0x02F8; - } - if (!memory_region_present(get_system_io(), io_port)) { - return; - } - - dev = aml_device("COM%d", uid); - aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0501"))); - aml_append(dev, aml_name_decl("_UID", aml_int(uid))); - - aml_append(dev, aml_name_decl("_STA", aml_int(0xf))); - - crs = aml_resource_template(); - aml_append(crs, aml_io(AML_DECODE16, io_port, io_port, 0x00, 0x08)); - aml_append(crs, aml_irq_no_flags(irq)); - aml_append(dev, aml_name_decl("_CRS", crs)); - - aml_append(scope, dev); -} - static void build_isa_devices_aml(Aml *table) { ISADevice *fdc = pc_find_fdc0(); @@ -1252,8 +1222,6 @@ static void build_isa_devices_aml(Aml *table) aml_append(scope, build_fdc_device_aml(fdc)); } aml_append(scope, build_lpt_device_aml()); - build_com_device_aml(scope, 1); - build_com_device_aml(scope, 2); if (ambiguous) { error_report("Multiple ISA busses, unable to define IPMI ACPI data"); From 3e824d38256364f983be8717214fef709bc90e31 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 15 May 2020 17:04:11 +0200 Subject: [PATCH 10/58] acpi: parallel: don't use _STA method The _STA method dates back to the days where we had a static DSDT. The device is listed in the DSDT table unconditionally and the _STA method checks a bit in the isa bridge pci config space to figure whenever a given is isa device is present or not, then evaluates to 0x0f (present) or 0x00 (absent). These days the DSDT is generated by qemu anyway, so if a device is not present we can simply drop it from the DSDT instead. Signed-off-by: Gerd Hoffmann Reviewed-by: Igor Mammedov Message-Id: <20200515150421.25479-7-kraxel@redhat.com> --- hw/i386/acpi-build.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 6de25f6484..2cafad03e2 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1173,39 +1173,26 @@ static Aml *build_mouse_device_aml(void) return dev; } -static Aml *build_lpt_device_aml(void) +static void build_lpt_device_aml(Aml *scope) { Aml *dev; Aml *crs; - Aml *method; - Aml *if_ctx; - Aml *else_ctx; - Aml *zero = aml_int(0); - Aml *is_present = aml_local(0); + + if (!memory_region_present(get_system_io(), 0x0378)) { + return; + } dev = aml_device("LPT"); aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0400"))); - method = aml_method("_STA", 0, AML_NOTSERIALIZED); - aml_append(method, aml_store(aml_name("LPEN"), is_present)); - if_ctx = aml_if(aml_equal(is_present, zero)); - { - aml_append(if_ctx, aml_return(aml_int(0x00))); - } - aml_append(method, if_ctx); - else_ctx = aml_else(); - { - aml_append(else_ctx, aml_return(aml_int(0x0f))); - } - aml_append(method, else_ctx); - aml_append(dev, method); + aml_append(dev, aml_name_decl("_STA", aml_int(0xf))); crs = aml_resource_template(); aml_append(crs, aml_io(AML_DECODE16, 0x0378, 0x0378, 0x08, 0x08)); aml_append(crs, aml_irq_no_flags(7)); aml_append(dev, aml_name_decl("_CRS", crs)); - return dev; + aml_append(scope, dev); } static void build_isa_devices_aml(Aml *table) @@ -1221,7 +1208,7 @@ static void build_isa_devices_aml(Aml *table) if (fdc) { aml_append(scope, build_fdc_device_aml(fdc)); } - aml_append(scope, build_lpt_device_aml()); + build_lpt_device_aml(scope); if (ambiguous) { error_report("Multiple ISA busses, unable to define IPMI ACPI data"); From ed003c8c7743b846b1bf3f87cb88baf090da44e6 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 15 May 2020 17:04:12 +0200 Subject: [PATCH 11/58] acpi: move aml builder code for parallel device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also adds support for multiple LPT devices. Signed-off-by: Gerd Hoffmann Reviewed-by: Igor Mammedov Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20200515150421.25479-8-kraxel@redhat.com> --- hw/char/parallel.c | 22 ++++++++++++++++++++++ hw/i386/acpi-build.c | 23 ----------------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/hw/char/parallel.c b/hw/char/parallel.c index 8dd67d1375..c0f34bf924 100644 --- a/hw/char/parallel.c +++ b/hw/char/parallel.c @@ -28,6 +28,7 @@ #include "qemu/module.h" #include "chardev/char-parallel.h" #include "chardev/char-fe.h" +#include "hw/acpi/aml-build.h" #include "hw/irq.h" #include "hw/isa/isa.h" #include "hw/qdev-properties.h" @@ -568,6 +569,25 @@ static void parallel_isa_realizefn(DeviceState *dev, Error **errp) s, "parallel"); } +static void parallel_isa_build_aml(ISADevice *isadev, Aml *scope) +{ + ISAParallelState *isa = ISA_PARALLEL(isadev); + Aml *dev; + Aml *crs; + + crs = aml_resource_template(); + aml_append(crs, aml_io(AML_DECODE16, isa->iobase, isa->iobase, 0x08, 0x08)); + aml_append(crs, aml_irq_no_flags(isa->isairq)); + + dev = aml_device("LPT%d", isa->index + 1); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0400"))); + aml_append(dev, aml_name_decl("_UID", aml_int(isa->index + 1))); + aml_append(dev, aml_name_decl("_STA", aml_int(0xf))); + aml_append(dev, aml_name_decl("_CRS", crs)); + + aml_append(scope, dev); +} + /* Memory mapped interface */ static uint64_t parallel_mm_readfn(void *opaque, hwaddr addr, unsigned size) { @@ -624,9 +644,11 @@ static Property parallel_isa_properties[] = { static void parallel_isa_class_initfn(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + ISADeviceClass *isa = ISA_DEVICE_CLASS(klass); dc->realize = parallel_isa_realizefn; dc->vmsd = &vmstate_parallel_isa; + isa->build_aml = parallel_isa_build_aml; device_class_set_props(dc, parallel_isa_properties); set_bit(DEVICE_CATEGORY_INPUT, dc->categories); } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 2cafad03e2..58fe505fb6 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1173,28 +1173,6 @@ static Aml *build_mouse_device_aml(void) return dev; } -static void build_lpt_device_aml(Aml *scope) -{ - Aml *dev; - Aml *crs; - - if (!memory_region_present(get_system_io(), 0x0378)) { - return; - } - - dev = aml_device("LPT"); - aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0400"))); - - aml_append(dev, aml_name_decl("_STA", aml_int(0xf))); - - crs = aml_resource_template(); - aml_append(crs, aml_io(AML_DECODE16, 0x0378, 0x0378, 0x08, 0x08)); - aml_append(crs, aml_irq_no_flags(7)); - aml_append(dev, aml_name_decl("_CRS", crs)); - - aml_append(scope, dev); -} - static void build_isa_devices_aml(Aml *table) { ISADevice *fdc = pc_find_fdc0(); @@ -1208,7 +1186,6 @@ static void build_isa_devices_aml(Aml *table) if (fdc) { aml_append(scope, build_fdc_device_aml(fdc)); } - build_lpt_device_aml(scope); if (ambiguous) { error_report("Multiple ISA busses, unable to define IPMI ACPI data"); From bab16ab33012cdae36246d2994b4a5ed9ac441f0 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 9 Jun 2020 11:16:14 -0400 Subject: [PATCH 12/58] tests/acpi: update DSDT expected files Update DSDT after CRS changes and _STA methods dropped. Signed-off-by: Michael S. Tsirkin --- tests/data/acpi/pc/DSDT | Bin 5125 -> 5014 bytes tests/data/acpi/pc/DSDT.acpihmat | Bin 6449 -> 6338 bytes tests/data/acpi/pc/DSDT.bridge | Bin 6984 -> 6873 bytes tests/data/acpi/pc/DSDT.cphp | Bin 5588 -> 5477 bytes tests/data/acpi/pc/DSDT.dimmpxm | Bin 6778 -> 6667 bytes tests/data/acpi/pc/DSDT.ipmikcs | Bin 5197 -> 5086 bytes tests/data/acpi/pc/DSDT.memhp | Bin 6484 -> 6373 bytes tests/data/acpi/pc/DSDT.numamem | Bin 5131 -> 5020 bytes tests/data/acpi/q35/DSDT | Bin 7863 -> 7752 bytes tests/data/acpi/q35/DSDT.acpihmat | Bin 9187 -> 9076 bytes tests/data/acpi/q35/DSDT.bridge | Bin 7880 -> 7769 bytes tests/data/acpi/q35/DSDT.cphp | Bin 8326 -> 8215 bytes tests/data/acpi/q35/DSDT.dimmpxm | Bin 9516 -> 9405 bytes tests/data/acpi/q35/DSDT.ipmibt | Bin 7938 -> 7827 bytes tests/data/acpi/q35/DSDT.memhp | Bin 9222 -> 9111 bytes tests/data/acpi/q35/DSDT.mmio64 | Bin 8993 -> 8882 bytes tests/data/acpi/q35/DSDT.numamem | Bin 7869 -> 7758 bytes tests/qtest/bios-tables-test-allowed-diff.h | 17 ----------------- 18 files changed, 17 deletions(-) diff --git a/tests/data/acpi/pc/DSDT b/tests/data/acpi/pc/DSDT index ad4b2d46cc7865e8bafcca2e4e888a03cc5483b5..384a82dbb3cb0e9f47db6f4d08945631c2b72b56 100644 GIT binary patch delta 164 zcmZqGn5NF<66_K(O_+g!v27z)FQb5;xIS}yuv2`1v!{V)uw(q>Ym5)2jC=w@3_0RG zJY9GkFR(Ch#D{viFml8Phd55Y%On+UYPPMU;J z0)_$x0VX8|Mt8;{h9U+gwn~P{%NZZm+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX>?2004S} BL(TvI diff --git a/tests/data/acpi/pc/DSDT.acpihmat b/tests/data/acpi/pc/DSDT.acpihmat index eff7aadfabe431c3ac2d28e0c6721eb6e322af66..47ddfdb027b06dc2daa46be711c3f4640ce68320 100644 GIT binary patch delta 164 zcmdmJbjXm)CDXE}*eV$&FK2vMZ|4&b63-Fu;pxKTc!7mML?S*o#F3%EC&1M& zVFBCZ1coIH3t5(M@pHsG2L%fXaPhh`RxnpEb8sj%FjO)`H@UJn`}-Qg3}9vCh!6F2 zVT2ju?13KkUL;&o^I!Tf`n nfkR1vp^_oGNi`_MIUZ&tH_%A9vI2$z21X7g2FA@z%*Vt5d=n{- delta 282 zcmcab|o6X5EX zuz+oH0>cu9g)B?B_&MU8gMtMGxOm+eE0`;oIXILW7%CZ}n_O9({e2B#2Cy=6#D{vi zFv5&+cEoMW59S}t3_cu60$_98K;{^snZpHiJ;XiEPIwGr`oYA&p~S(kS(y2lH~`2b BMNI$z diff --git a/tests/data/acpi/pc/DSDT.cphp b/tests/data/acpi/pc/DSDT.cphp index f3572358510f3fbb3d966047274f7aa7835e7bef..54f481faf1e336c0bbf5e774cd67220fe06e951b 100644 GIT binary patch delta 164 zcmcbj{Zxy~CD>tYEHS=HO6jV5nq>ZgORD_V+b}8NkZO5g+R5 z!U!|Q*%7xfKbU_oGx%^Q34qOU1DRulW)2t7^$_Ym5)2jC=w@3_0RG zJY9GkFR(Ch#D{viFml8Phd55Y%On+UYPPMU;J z0)_$x0VX8|Mt8;{h9U+gwn~P{%NZZm+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX@2v3;^?H BMW_G( diff --git a/tests/data/acpi/pc/DSDT.ipmikcs b/tests/data/acpi/pc/DSDT.ipmikcs index 469d13e1f6b873bb9cfa0b3af32d1a3bc58e8f77..57b78358744a5bb13639ccddb887be2721240807 100644 GIT binary patch delta 172 zcmX@BaZjDgCDjc!oU$9>gmGB5g#1lIQcGUHE!Nn_l delta 259 zcmcboepZ9aCDCqao0w!UPgOYWqs!OV5j&1XHNsqV8{6ACf%SA=Xj2I4^I~! z#|zvH9P!RU!GaQ8qV9|Z3tJCJk%$it zabzg)32^mGSim+pfnf>5LY5_yUolDPy0SR?`x?SbXJzDw5A}3mgz0d0MAae3EamD3 t(qV+AgNq5S#R)|V7eBb|o6X5EX zuz+oH0>cu9g)B?B_&MU8gMtMGxOm+eE0`;oIXILW7%CZ}n_O9({e2B#2Cy=6#D{vi zFv5&+cEoMW59S}t3_cu60$_98K;{^snZpHiJ;XiEPIwGr`oYA&p~S(kS(y2fC;*9S BMH&DA diff --git a/tests/data/acpi/pc/DSDT.numamem b/tests/data/acpi/pc/DSDT.numamem index 9a747f6f08f61c73b891d8f91db01521e635f811..f194bc639482eb839a875d493857526f85f1a9e0 100644 GIT binary patch delta 164 zcmeCyn4`|+66_K(N0@Ym5)2jC=w@3_0RG zJY9GkFR(Ch#D{viFml8Phd55Y%On+Ub|o6X5EX zuz+oH0>cu9g)B?B_&MU8gMtMGxOm+eE0`;oIXILW7%CZ}n_O9({e2B#2Cy=6#D{vi zFv5&+cEoMW59S}t3_cu60$_98K;{^snZpHiJ;XiEPIwGr`oYA&p~S(kS(v#~002NJ BM34Xg diff --git a/tests/data/acpi/q35/DSDT b/tests/data/acpi/q35/DSDT index 9fa4d5a405c2bcd9313b13894917622bf156013e..6a5e4dd85a7d9a95f7ad0fb95e6a4fa7a8d91adb 100644 GIT binary patch delta 164 zcmdmPd%}jxCDXE}*eV$&Yca*u+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX-^w4FJ*) BL!JNt diff --git a/tests/data/acpi/q35/DSDT.acpihmat b/tests/data/acpi/q35/DSDT.acpihmat index 2d834a854ccddc17afd0bc4b4a9e0886feff8e65..c1dd7773f3386a946fcb4a9a3bf9ad3a33ddbbe9 100644 GIT binary patch delta 164 zcmaFt{>6>UCDXE}*eV$&Yca*u+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEXzdmz(uv2`1v!{V)uw(q>NTxU`BcFf}LymY4 zPZu7?3oHyA@u8kBj2!X7A&!&pGD(FSIs5y?REWIH4xP delta 282 zcmcaXE}*eV$&Yca*u+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX?dC4FIJi BLxKPR diff --git a/tests/data/acpi/q35/DSDT.cphp b/tests/data/acpi/q35/DSDT.cphp index c59c19ff46b9bb4fa3e06e9ffbcbeba308a80cd0..74e86176e5fec46e660c567acf8fbcf08a14bdfb 100644 GIT binary patch delta 164 zcmZp3obJHo66_KpuE4;+=(~|ifJwlcU!OTX*eO21+0(!?*fD-`BvYJ}kxxK~AxFH2 zrwfnc1r`R5_)t$5MvnO45XZ@PnWVyvoc(>_(yTxUxHK0(N4#@Tu%G}JuRG%p<{!)q n97+NVl?>5MszD*n@h~H~fkwiW6)+SqFmfm{Fm7&QW|0K|t9T`# delta 282 zcmbR4(B{bH66_Mvroh0!_--SY0F$tnm_BoSuv2`1v!{V)uw#65lkVh?Od=8zT%zub z1q=lY0!&H_jP8s@3`Gn~Y?Ta?wV2}S?R)}4;yL0yJY9GkFR(C(NW=$+I5HIY1i1Po zEMS|Qz_5g2AXE}*eV$&Yca*u+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX=%05de>t BL)!oV diff --git a/tests/data/acpi/q35/DSDT.ipmibt b/tests/data/acpi/q35/DSDT.ipmibt index 3910e9b767808962b46501da51945229359e3d1d..38723daef80421ea528b2ad2d411e7357df43956 100644 GIT binary patch delta 172 zcmZp&n{3PF66_K(S&o5$kz*s50F!{5h(2?Cuv2`1v!{V)uw(q>NT#^SZOooB(M?7^ z0U?GQ@gANoJdPJw7&ziXJzW?%;)6pRC*Ng~iZpWe_k~Nd0wv(mT>KpI&OyO~0$jZA uj6axvFf(u{2{2SLL^r7hg*eB+c9V`qY67j(y zjtm7p0j_=t3)m(nFf3tM$g*VeTP7)8R~Bb~UqhJbtc)D-p`I>`Fdfd0s5%sxrCi)V rI*ibCa52HPIH73a;)l3bfQ#3i@dwinCI${A4u;7sOum~Fn2n_Y$^<{r diff --git a/tests/data/acpi/q35/DSDT.memhp b/tests/data/acpi/q35/DSDT.memhp index 8461e984c965da916d828884f6629422e83e429c..98328d1c4197ab19a71de7f3f18e2985f4910f45 100644 GIT binary patch delta 164 zcmZqknC{Ny66_K(U73M_@z_Q#0VV-&etqWnV5j&1XHNsqV8{5$kxX$?Mm_-{h8*!8 zo-RC&7g!iL;zK=M7&+pDLmVgHWs(Xva`yLyOS1wc;L=?D9P!RU!GZ!@yzY!Yn13)c na3~2dR5C<2sRo5O$HR=|1{w)hR=`lez{sJ*z__`IxkUj0OPVJw delta 282 zcmbR4-sZvO66_MfrozC$n6{BifJxX(OrJSE*eO21+0(!?*fBo3Nq6!`CJ_kU;J z0)_$x0VX8|Mt8;{h9U+gwn~P{T1;{Ec0K_i@f`6Uo-RC&7g!iXB;tcZ92p9H0$lwP z7O+iDU|7PikYx!MKS#WCP_Uo?7q2^G1#<;62ZvGvLnT9WlPim}zpo+809Hnh_)t$5 zMwl_qj<}8a!Tf`n!G}Xh0BnvM$Q&aybGU%6hq%Yt36DWcKbROelsFhR3p2MU006lR BLhJwl diff --git a/tests/data/acpi/q35/DSDT.mmio64 b/tests/data/acpi/q35/DSDT.mmio64 index fc0cc096baf8aedc0a526978ff796025d7380453..5916c0e9ce0a9607c6230f9dfebe2c1be70b2495 100644 GIT binary patch delta 164 zcmZ4Jw#k*tCDXVQ;;W5 delta 282 zcmdnwy3mcwCDzdmz(uv2`1v!{V)uw(q>NTxU`BcFf}LymY4 zPZu7?3oHyA@u8kBj2!X7A&!&pGD(FSIs5yXE}*eV$&Yca*u+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX-^z4FHR! BLtp>^ diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index 6a052c5044..dfb8523c8b 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1,18 +1 @@ /* List of comma-separated changed AML files to ignore */ -"tests/data/acpi/pc/DSDT", -"tests/data/acpi/pc/DSDT.acpihmat", -"tests/data/acpi/pc/DSDT.bridge", -"tests/data/acpi/pc/DSDT.cphp", -"tests/data/acpi/pc/DSDT.dimmpxm", -"tests/data/acpi/pc/DSDT.ipmikcs", -"tests/data/acpi/pc/DSDT.memhp", -"tests/data/acpi/pc/DSDT.numamem", -"tests/data/acpi/q35/DSDT", -"tests/data/acpi/q35/DSDT.acpihmat", -"tests/data/acpi/q35/DSDT.bridge", -"tests/data/acpi/q35/DSDT.cphp", -"tests/data/acpi/q35/DSDT.dimmpxm", -"tests/data/acpi/q35/DSDT.ipmibt", -"tests/data/acpi/q35/DSDT.memhp", -"tests/data/acpi/q35/DSDT.mmio64", -"tests/data/acpi/q35/DSDT.numamem", From 7e7c1b84ca11c36a4269baa6538a47dbdf2ee758 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Fri, 29 May 2020 15:28:40 -0400 Subject: [PATCH 13/58] acpi: tpm: Do not build TCPA table for TPM 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not build a TCPA table for TPM 2 anymore but create the log area when building the TPM2 table. The TCPA table is only needed for TPM 1.2. Signed-off-by: Stefan Berger Reviewed-by: Marc-André Lureau Reviewed-by: Eric Auger Reviewed-by: Igor Mammedov Signed-off-by: Michael S. Tsirkin Reviewed-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 58fe505fb6..d05d010f77 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2216,6 +2216,10 @@ build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog) tpm2_ptr->log_area_minimum_length = cpu_to_le32(TPM_LOG_AREA_MINIMUM_SIZE); + acpi_data_push(tcpalog, le32_to_cpu(tpm2_ptr->log_area_minimum_length)); + bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1, + false); + /* log area start address to be filled by Guest linker */ bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE, log_addr_offset, log_addr_size, @@ -2752,10 +2756,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) build_hpet(tables_blob, tables->linker); } if (misc.tpm_version != TPM_VERSION_UNSPEC) { - acpi_add_table(table_offsets, tables_blob); - build_tpm_tcpa(tables_blob, tables->linker, tables->tcpalog); - - if (misc.tpm_version == TPM_VERSION_2_0) { + if (misc.tpm_version == TPM_VERSION_1_2) { + acpi_add_table(table_offsets, tables_blob); + build_tpm_tcpa(tables_blob, tables->linker, tables->tcpalog); + } else { /* TPM_VERSION_2_0 */ acpi_add_table(table_offsets, tables_blob); build_tpm2(tables_blob, tables->linker, tables->tcpalog); } From 04b778610af51987309055440b1ff16b50db5646 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 1 Jun 2020 11:57:34 +0200 Subject: [PATCH 14/58] acpi: Convert build_tpm2() to build_append* API In preparation of its move to the generic acpi code, let's convert build_tpm2() to use build_append API. This latter now is prefered in place of direct ACPI struct field settings with manual endianness conversion. Signed-off-by: Eric Auger Message-Id: <20200601095737.32671-2-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index d05d010f77..8d93a2d339 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2197,30 +2197,40 @@ build_tpm_tcpa(GArray *table_data, BIOSLinker *linker, GArray *tcpalog) static void build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog) { - Acpi20TPM2 *tpm2_ptr = acpi_data_push(table_data, sizeof *tpm2_ptr); + Acpi20TPM2 *tpm2_ptr = acpi_data_push(table_data, sizeof(AcpiTableHeader)); unsigned log_addr_size = sizeof(tpm2_ptr->log_area_start_address); unsigned log_addr_offset = (char *)&tpm2_ptr->log_area_start_address - table_data->data; + uint8_t start_method_params[12] = {}; - tpm2_ptr->platform_class = cpu_to_le16(TPM2_ACPI_CLASS_CLIENT); + /* platform class */ + build_append_int_noprefix(table_data, TPM2_ACPI_CLASS_CLIENT, 2); + /* reserved */ + build_append_int_noprefix(table_data, 0, 2); if (TPM_IS_TIS_ISA(tpm_find())) { - tpm2_ptr->control_area_address = cpu_to_le64(0); - tpm2_ptr->start_method = cpu_to_le32(TPM2_START_METHOD_MMIO); + /* address of control area */ + build_append_int_noprefix(table_data, 0, 8); + /* start method */ + build_append_int_noprefix(table_data, TPM2_START_METHOD_MMIO, 4); } else if (TPM_IS_CRB(tpm_find())) { - tpm2_ptr->control_area_address = cpu_to_le64(TPM_CRB_ADDR_CTRL); - tpm2_ptr->start_method = cpu_to_le32(TPM2_START_METHOD_CRB); + build_append_int_noprefix(table_data, TPM_CRB_ADDR_CTRL, 8); + build_append_int_noprefix(table_data, TPM2_START_METHOD_CRB, 4); } else { g_warn_if_reached(); } - tpm2_ptr->log_area_minimum_length = - cpu_to_le32(TPM_LOG_AREA_MINIMUM_SIZE); + /* platform specific parameters */ + g_array_append_vals(table_data, &start_method_params, 12); - acpi_data_push(tcpalog, le32_to_cpu(tpm2_ptr->log_area_minimum_length)); + /* log area minimum length */ + build_append_int_noprefix(table_data, TPM_LOG_AREA_MINIMUM_SIZE, 4); + + acpi_data_push(tcpalog, TPM_LOG_AREA_MINIMUM_SIZE); bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1, false); /* log area start address to be filled by Guest linker */ + build_append_int_noprefix(table_data, 0, 8); bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE, log_addr_offset, log_addr_size, ACPI_BUILD_TPMLOG_FILE, 0); From 4338416064303aad7929a247770f2ed4d9652966 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 1 Jun 2020 11:57:35 +0200 Subject: [PATCH 15/58] acpi: Move build_tpm2() in the generic part We plan to build the TPM2 table on ARM too. In order to reuse the generation code, let's move build_tpm2() to aml-build.c. No change in the implementation. Signed-off-by: Eric Auger Message-Id: <20200601095737.32671-3-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/aml-build.c | 44 +++++++++++++++++++++++++++++++++++++ hw/i386/acpi-build.c | 44 ------------------------------------- include/hw/acpi/aml-build.h | 2 ++ 3 files changed, 46 insertions(+), 44 deletions(-) diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index 3681ec6e3d..b37052c1b4 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -26,6 +26,7 @@ #include "qemu/bitops.h" #include "sysemu/numa.h" #include "hw/boards.h" +#include "hw/acpi/tpm.h" static GArray *build_alloc_array(void) { @@ -1877,6 +1878,49 @@ build_hdr: "FACP", tbl->len - fadt_start, f->rev, oem_id, oem_table_id); } +void build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog) +{ + Acpi20TPM2 *tpm2_ptr = acpi_data_push(table_data, sizeof(AcpiTableHeader)); + unsigned log_addr_size = sizeof(tpm2_ptr->log_area_start_address); + unsigned log_addr_offset = + (char *)&tpm2_ptr->log_area_start_address - table_data->data; + uint8_t start_method_params[12] = {}; + + /* platform class */ + build_append_int_noprefix(table_data, TPM2_ACPI_CLASS_CLIENT, 2); + /* reserved */ + build_append_int_noprefix(table_data, 0, 2); + if (TPM_IS_TIS_ISA(tpm_find())) { + /* address of control area */ + build_append_int_noprefix(table_data, 0, 8); + /* start method */ + build_append_int_noprefix(table_data, TPM2_START_METHOD_MMIO, 4); + } else if (TPM_IS_CRB(tpm_find())) { + build_append_int_noprefix(table_data, TPM_CRB_ADDR_CTRL, 8); + build_append_int_noprefix(table_data, TPM2_START_METHOD_CRB, 4); + } else { + g_warn_if_reached(); + } + + /* platform specific parameters */ + g_array_append_vals(table_data, &start_method_params, 12); + + /* log area minimum length */ + build_append_int_noprefix(table_data, TPM_LOG_AREA_MINIMUM_SIZE, 4); + + acpi_data_push(tcpalog, TPM_LOG_AREA_MINIMUM_SIZE); + bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1, + false); + + /* log area start address to be filled by Guest linker */ + build_append_int_noprefix(table_data, 0, 8); + bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE, + log_addr_offset, log_addr_size, + ACPI_BUILD_TPMLOG_FILE, 0); + build_header(linker, table_data, + (void *)tpm2_ptr, "TPM2", sizeof(*tpm2_ptr), 4, NULL, NULL); +} + /* ACPI 5.0: 6.4.3.8.2 Serial Bus Connection Descriptors */ static Aml *aml_serial_bus_device(uint8_t serial_bus_type, uint8_t flags, uint16_t type_flags, diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 8d93a2d339..1ecb68f45f 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2194,50 +2194,6 @@ build_tpm_tcpa(GArray *table_data, BIOSLinker *linker, GArray *tcpalog) (void *)tcpa, "TCPA", sizeof(*tcpa), 2, NULL, NULL); } -static void -build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog) -{ - Acpi20TPM2 *tpm2_ptr = acpi_data_push(table_data, sizeof(AcpiTableHeader)); - unsigned log_addr_size = sizeof(tpm2_ptr->log_area_start_address); - unsigned log_addr_offset = - (char *)&tpm2_ptr->log_area_start_address - table_data->data; - uint8_t start_method_params[12] = {}; - - /* platform class */ - build_append_int_noprefix(table_data, TPM2_ACPI_CLASS_CLIENT, 2); - /* reserved */ - build_append_int_noprefix(table_data, 0, 2); - if (TPM_IS_TIS_ISA(tpm_find())) { - /* address of control area */ - build_append_int_noprefix(table_data, 0, 8); - /* start method */ - build_append_int_noprefix(table_data, TPM2_START_METHOD_MMIO, 4); - } else if (TPM_IS_CRB(tpm_find())) { - build_append_int_noprefix(table_data, TPM_CRB_ADDR_CTRL, 8); - build_append_int_noprefix(table_data, TPM2_START_METHOD_CRB, 4); - } else { - g_warn_if_reached(); - } - - /* platform specific parameters */ - g_array_append_vals(table_data, &start_method_params, 12); - - /* log area minimum length */ - build_append_int_noprefix(table_data, TPM_LOG_AREA_MINIMUM_SIZE, 4); - - acpi_data_push(tcpalog, TPM_LOG_AREA_MINIMUM_SIZE); - bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1, - false); - - /* log area start address to be filled by Guest linker */ - build_append_int_noprefix(table_data, 0, 8); - bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE, - log_addr_offset, log_addr_size, - ACPI_BUILD_TPMLOG_FILE, 0); - build_header(linker, table_data, - (void *)tpm2_ptr, "TPM2", sizeof(*tpm2_ptr), 4, NULL, NULL); -} - #define HOLE_640K_START (640 * KiB) #define HOLE_640K_END (1 * MiB) diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index ed7c89309e..d27da03d64 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -437,4 +437,6 @@ void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms); void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, const char *oem_id, const char *oem_table_id); + +void build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog); #endif From 80bde69353924d99e2a7f5ac6be0ab4cf489d33c Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 1 Jun 2020 11:57:36 +0200 Subject: [PATCH 16/58] arm/acpi: TPM2 ACPI table support Add a TPM2 ACPI table if a TPM2.0 sysbus device has been dynamically instantiated. Signed-off-by: Eric Auger Message-Id: <20200601095737.32671-4-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/aml-build.c | 5 +++-- hw/arm/virt-acpi-build.c | 7 +++++++ include/sysemu/tpm.h | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index b37052c1b4..d24e9e6c3a 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -1885,17 +1885,18 @@ void build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog) unsigned log_addr_offset = (char *)&tpm2_ptr->log_area_start_address - table_data->data; uint8_t start_method_params[12] = {}; + TPMIf *tpmif = tpm_find(); /* platform class */ build_append_int_noprefix(table_data, TPM2_ACPI_CLASS_CLIENT, 2); /* reserved */ build_append_int_noprefix(table_data, 0, 2); - if (TPM_IS_TIS_ISA(tpm_find())) { + if (TPM_IS_TIS_ISA(tpmif) || TPM_IS_TIS_SYSBUS(tpmif)) { /* address of control area */ build_append_int_noprefix(table_data, 0, 8); /* start method */ build_append_int_noprefix(table_data, TPM2_START_METHOD_MMIO, 4); - } else if (TPM_IS_CRB(tpm_find())) { + } else if (TPM_IS_CRB(tpmif)) { build_append_int_noprefix(table_data, TPM_CRB_ADDR_CTRL, 8); build_append_int_noprefix(table_data, TPM2_START_METHOD_CRB, 4); } else { diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 2cbccd5fe2..ca31f70f7f 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -41,12 +41,14 @@ #include "hw/acpi/pci.h" #include "hw/acpi/memory_hotplug.h" #include "hw/acpi/generic_event_device.h" +#include "hw/acpi/tpm.h" #include "hw/pci/pcie_host.h" #include "hw/pci/pci.h" #include "hw/arm/virt.h" #include "hw/mem/nvdimm.h" #include "sysemu/numa.h" #include "sysemu/reset.h" +#include "sysemu/tpm.h" #include "kvm_arm.h" #include "migration/vmstate.h" #include "hw/acpi/ghes.h" @@ -848,6 +850,11 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) build_iort(tables_blob, tables->linker, vms); } + if (tpm_get_version(tpm_find()) == TPM_VERSION_2_0) { + acpi_add_table(table_offsets, tables_blob); + build_tpm2(tables_blob, tables->linker, tables->tcpalog); + } + /* XSDT is pointed to by RSDP */ xsdt = tables_blob->len; build_xsdt(tables_blob, tables->linker, table_offsets, NULL, NULL); diff --git a/include/sysemu/tpm.h b/include/sysemu/tpm.h index f37851b1aa..03fb25941c 100644 --- a/include/sysemu/tpm.h +++ b/include/sysemu/tpm.h @@ -50,6 +50,8 @@ typedef struct TPMIfClass { #define TPM_IS_TIS_ISA(chr) \ object_dynamic_cast(OBJECT(chr), TYPE_TPM_TIS_ISA) +#define TPM_IS_TIS_SYSBUS(chr) \ + object_dynamic_cast(OBJECT(chr), TYPE_TPM_TIS_SYSBUS) #define TPM_IS_CRB(chr) \ object_dynamic_cast(OBJECT(chr), TYPE_TPM_CRB) #define TPM_IS_SPAPR(chr) \ From 266345a867c290d6664c008b3bb3945395eca821 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Jun 2020 14:54:05 +0200 Subject: [PATCH 17/58] test/tpm-emu: include sockets and channel headers in tpm-emu header Include sockets and channel headers to that the header is self-contained. Signed-off-by: Eric Auger Reviewed-by: Stefan Berger Message-Id: <20200609125409.24179-2-eric.auger@redhat.com> --- tests/qtest/tpm-emu.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/qtest/tpm-emu.h b/tests/qtest/tpm-emu.h index a4f1d64226..73f3bed0c4 100644 --- a/tests/qtest/tpm-emu.h +++ b/tests/qtest/tpm-emu.h @@ -16,6 +16,9 @@ #define TPM_RC_FAILURE 0x101 #define TPM2_ST_NO_SESSIONS 0x8001 +#include "qemu/sockets.h" +#include "io/channel.h" + struct tpm_hdr { uint16_t tag; uint32_t len; From 6d6d1a23fc2ed42ab6ac76e289000474def8ff0c Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Jun 2020 14:54:06 +0200 Subject: [PATCH 18/58] tests/acpi: Add void tables for Q35/TPM-TIS bios-tables-test Add placeholders for TPM and DSDT reference tables for Q35 TPM-TIS tests and ignore them for the time being. Signed-off-by: Eric Auger Reviewed-by: Stefan Berger Reviewed-by: Igor Mammedov Message-Id: <20200609125409.24179-3-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/data/acpi/q35/DSDT.tis | 0 tests/data/acpi/q35/TPM2.tis | 0 tests/qtest/bios-tables-test-allowed-diff.h | 2 ++ 3 files changed, 2 insertions(+) create mode 100644 tests/data/acpi/q35/DSDT.tis create mode 100644 tests/data/acpi/q35/TPM2.tis diff --git a/tests/data/acpi/q35/DSDT.tis b/tests/data/acpi/q35/DSDT.tis new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/q35/TPM2.tis b/tests/data/acpi/q35/TPM2.tis new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..a2a45d1d31 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,3 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/q35/DSDT.tis", +"tests/data/acpi/q35/TPM2.tis", From c7504b9f32f2ee6ddcb2b0d69219046b3fa48c2d Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Jun 2020 14:54:07 +0200 Subject: [PATCH 19/58] tests: tpm-emu: Remove assert on TPM2_ST_NO_SESSIONS bios-tables-test executes SeaBIOS. Indeed FW is needed to fetch tables from QEMU and put them into the guest RAM. Also the FW patches cross table pointers. At some point, SeaBIOS ends up calling the TPM2_CC_HierarchyControl command with TPM2_ST_SESSIONS tag, most probably steming from tpm_set_failure/tpm20_hierarchycontrol SeaBIOS call path. This causes an assert() in the qtest tpm emulation code. As the goal here is not to boot SeaBIOS completely but just let it grab the ACPI tables and consolidate them, let's just remove the assert(). Signed-off-by: Eric Auger Message-Id: <20200609125409.24179-4-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/qtest/tpm-emu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/qtest/tpm-emu.c b/tests/qtest/tpm-emu.c index c43ac4aef8..298d0eec74 100644 --- a/tests/qtest/tpm-emu.c +++ b/tests/qtest/tpm-emu.c @@ -49,7 +49,6 @@ static void *tpm_emu_tpm_thread(void *data) s->tpm_msg->tag = be16_to_cpu(s->tpm_msg->tag); s->tpm_msg->len = be32_to_cpu(s->tpm_msg->len); g_assert_cmpint(s->tpm_msg->len, >=, minhlen); - g_assert_cmpint(s->tpm_msg->tag, ==, TPM2_ST_NO_SESSIONS); s->tpm_msg = g_realloc(s->tpm_msg, s->tpm_msg->len); qio_channel_read(ioc, (char *)&s->tpm_msg->code, From 5da7c35e25a4cba351602774e30cb59173af3c73 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Jun 2020 14:54:08 +0200 Subject: [PATCH 20/58] bios-tables-test: Add Q35/TPM-TIS test Test tables specific to the TPM-TIS instantiation. The TPM2 is added in the framework. Also the DSDT is updated with the TPM. The new function should be be usable for CRB as well, later one. Signed-off-by: Eric Auger Message-Id: <20200609125409.24179-5-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/qtest/Makefile.include | 1 + tests/qtest/bios-tables-test.c | 58 ++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/tests/qtest/Makefile.include b/tests/qtest/Makefile.include index 9e5a51d033..5023fa413d 100644 --- a/tests/qtest/Makefile.include +++ b/tests/qtest/Makefile.include @@ -262,6 +262,7 @@ tests/qtest/hd-geo-test$(EXESUF): tests/qtest/hd-geo-test.o $(libqos-obj-y) tests/qtest/boot-order-test$(EXESUF): tests/qtest/boot-order-test.o $(libqos-obj-y) tests/qtest/boot-serial-test$(EXESUF): tests/qtest/boot-serial-test.o $(libqos-obj-y) tests/qtest/bios-tables-test$(EXESUF): tests/qtest/bios-tables-test.o \ + tests/qtest/tpm-emu.o $(test-io-obj-y) \ tests/qtest/boot-sector.o tests/qtest/acpi-utils.o $(libqos-obj-y) tests/qtest/pxe-test$(EXESUF): tests/qtest/pxe-test.o tests/qtest/boot-sector.o $(libqos-obj-y) tests/qtest/microbit-test$(EXESUF): tests/qtest/microbit-test.o diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index c9843829b3..53f104a9c5 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -57,6 +57,9 @@ #include "qemu/bitmap.h" #include "acpi-utils.h" #include "boot-sector.h" +#include "tpm-emu.h" +#include "hw/acpi/tpm.h" + #define MACHINE_PC "pc" #define MACHINE_Q35 "q35" @@ -874,6 +877,60 @@ static void test_acpi_piix4_tcg_numamem(void) free_test_data(&data); } +uint64_t tpm_tis_base_addr; + +static void test_acpi_tcg_tpm(const char *machine, const char *tpm_if, + uint64_t base) +{ + gchar *tmp_dir_name = g_strdup_printf("qemu-test_acpi_%s_tcg_%s.XXXXXX", + machine, tpm_if); + char *tmp_path = g_dir_make_tmp(tmp_dir_name, NULL); + TestState test; + test_data data; + GThread *thread; + char *args, *variant = g_strdup_printf(".%s", tpm_if); + + tpm_tis_base_addr = base; + + module_call_init(MODULE_INIT_QOM); + + test.addr = g_new0(SocketAddress, 1); + test.addr->type = SOCKET_ADDRESS_TYPE_UNIX; + test.addr->u.q_unix.path = g_build_filename(tmp_path, "sock", NULL); + g_mutex_init(&test.data_mutex); + g_cond_init(&test.data_cond); + test.data_cond_signal = false; + + thread = g_thread_new(NULL, tpm_emu_ctrl_thread, &test); + tpm_emu_test_wait_cond(&test); + + memset(&data, 0, sizeof(data)); + data.machine = machine; + data.variant = variant; + + args = g_strdup_printf( + " -chardev socket,id=chr,path=%s" + " -tpmdev emulator,id=dev,chardev=chr" + " -device tpm-%s,tpmdev=dev", + test.addr->u.q_unix.path, tpm_if); + + test_acpi_one(args, &data); + + g_thread_join(thread); + g_unlink(test.addr->u.q_unix.path); + qapi_free_SocketAddress(test.addr); + g_rmdir(tmp_path); + g_free(variant); + g_free(tmp_path); + g_free(tmp_dir_name); + free_test_data(&data); +} + +static void test_acpi_q35_tcg_tpm_tis(void) +{ + test_acpi_tcg_tpm("q35", "tis", 0xFED40000); +} + static void test_acpi_tcg_dimm_pxm(const char *machine) { test_data data; @@ -1037,6 +1094,7 @@ int main(int argc, char *argv[]) return ret; } + qtest_add_func("acpi/q35/tpm-tis", test_acpi_q35_tcg_tpm_tis); qtest_add_func("acpi/piix4", test_acpi_piix4_tcg); qtest_add_func("acpi/piix4/bridge", test_acpi_piix4_tcg_bridge); qtest_add_func("acpi/q35", test_acpi_q35_tcg); From cae98d8c86b7c76eae1c389ff7b9bd98143f656d Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Jun 2020 14:54:09 +0200 Subject: [PATCH 21/58] bios-tables-test: Generate reference tables for Q35/TPM-TIS TPM2, DSDT tables were generated using tests/data/acpi/rebuild-expected-aml.sh Signed-off-by: Eric Auger Reviewed-by: Stefan Berger Message-Id: <20200609125409.24179-6-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/data/acpi/q35/DSDT.tis | Bin 0 -> 8357 bytes tests/data/acpi/q35/TPM2.tis | Bin 0 -> 76 bytes tests/qtest/bios-tables-test-allowed-diff.h | 2 -- 3 files changed, 2 deletions(-) diff --git a/tests/data/acpi/q35/DSDT.tis b/tests/data/acpi/q35/DSDT.tis index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..56b6fb0c3298517d080e38fea05a748b9f1dba54 100644 GIT binary patch literal 8357 zcmb7JON=AM8Lskc+U@pC+vC|cthEqD1O%QLASe+DcDG-d@l21~y9B(Edp3~F?nc4_ zR)VaMW(JAJN{IpnluINpa-v+KTyhV}IR_3affVJCYouHfMXYjI=liQ%yH&PFVtr`$ zU;TamUtjg(>#p)czw_H7WBd=5ZLbj)D?bc8A8nj5M*H;dHWL?Fci*qndpr|M@a{oI z>|;~03Xl1f^L75~z37uDdi~vq@AlcHPu-6%?e^b(FS^7CbnQ~#wrR)RuzO*p*FE&? zz-yMQUZd$d<UnZ6ZuQE3n_0c{O`jJw7kb;Fm$-42dH(aO!>#0CFSl&F-PYgM zzdC#7xlf*dz5Lm;U;O08RXYK|8vZT(ulH#aU61Uu(a`;HaGAeFba8BTxcuQj9F}b^ z9qC8Za?_DblxlT!)ood_uT`rNR6DA#96dzeFdmJd(_#)|pWn%_MEAg@c=p`9Df!3Z3Z1bCQ#LweMqPkpsLI(SGTxV!DchS#@$$YBCaY@weI1Q8x`Ma#8WrJne$#cvl)Ec_As&XeL z=ik7(cDBynh`$lHtaJM!^BNT0m1{6GAn3$HCAwX_~-pSirinmz-csG@1Nqz4ceU`zwz3BK9j(SWSd(XRVMd2_G>Kf zP`e#NP**`5L4)P931>XUeI;>|h*$tF%432Ds0xXR`~n*jOkgJwF%>k%1QSrfL`Y2Z zMquNDsd_+{g2*bMamfT!g~U`O4YSXVTD_6ifw8N~Y>OLS1Ld z(3uiU1x=Yc5$ZY@44n&x&IMB^LS3h2=(G%-mZ=k=t`j$_HqvQBXWG<>P}gZ2I&DLz zZR$j*>&zHBGltHLsS}~D6AumTd^(0s$JB{X*O@hRW(}QLQzt@Q=c1u=(a^bQ>O`pP zTrzYn89J9tod|WEuA$R4bh@Tagu2e0p)+Ub%$Yh7>N@j=&b*;BZ|X#->s&T;E*mu}HkhYP zCPJNg#$cW?m}g8TLY?`5!F<4AK43Bt>dXfPQ@PB8f~lVI4@#zba3dt9a+0$~&9g?$ zvt~_%vZe(CEo%l^51V%l6#FPVF;E03D}y)!RTKs)al(=~;}WPGGbW)bB&L!E1C>~1 zpc<46R6+%l3=|=HqYPAHClS?DUP}flp?3`wA-XQGFi?q|M8s5sO9m>Tf=LF7Q0at$ zN~~bQKs6{CsDuh887M-f69y`=f(ZlFpk$yDDwt%T2$fD4sKg2;3{-=Xfl8=gl7S*r zI$@v^E0{1)4N3+op@K;Uicsl$|V8TE(C>f}P z3MLsSLZuT1DzSnI1J$5ppb{#WWS|I@P8g`f3MLFxgOY(ts9=(TB2+qIpb{&XFi;Ij z1}dR~Nd}5g>4bqwtYE@GH7FUVgbF4ZC_<$Z1}d?F2?N!jWS|l%m}H;`l};F_#0n-1 zRD+U%N~mCxfg)5oVW1K#m@rTcN(L&Sf=LF7Q0at$N~~bQKs6{CsDuh887M-f69y`= zf(ZlFpk$yDDwt%T2$fD4sKg2;3{-=Xfl8=gl7S*rI$@v^E0{1)4N3+op@K;Uicsl< zfg;ij6p?PA2z3KRs2Qlngn?>IGEj|42C6Y(pc)efsxiqxH6|IT#)N@tOco$ zWS|-o2C6Y(pc<15RAZ8XYD^d?B6W6QponnB#9)$vBE)G>Au-k1!axzJv4w#mQe#U7 zicpO$87M-Z6;AbFp={_3#K-E0K1gq-=&#&8_|6k?`bwqGcA6OAL3^hR$K9#W#-ojI zF-%kJRB69Pn>uY8R$;lh6L$D8ecj&Q!+b<~WM>LbHg~*s8h~~ww`gbHPq8%33*m$M z2DRBqY@7Y-d-f8eJ#?atNs7${yd#v=H@$urnaMwuq1S!gk1S>`nSW<*9vAy)soDa|BV5!EZ`pI*IEm|wlpJ#3M7Z>qW-yN~joR^Aiky{Yov5#_x}d4IY*W^BH;dS5H= zi}LORWXweodQzCKmHenk2DqdZ)oKkqI_ejeB+4n4U|vNqinjoE6b~g z2VN_1yUWQpv#pR4w>F}mAlTNmo>D<%W?D$gHs$gva>j+L9ljH6tX5!_d!^L}3+CjS-s;kt- z%?WSHdcpZ(1Gt`vuJo~Fnxe}-?x{d?l)(SY;KIALMus%?ynG^PK69}^A{S6 zUg&LEN6FyPr{&Bb;^=yxO((?&59^5;LE=ncxTfJ4eBPhw8PI(m-P}kF8?&jz`JA81 zj?d--9J%HTZKb{Ku;Uyhbnf|@=`0lYFVE4{w`0?-1idodh#{EUyYAc1&{c(-4?kR> zxU#GfU0=8cTjfqJV_OWL`t%Z5Si}-tCqrV`-sLNW)3|)=LA6&ct>FZT6(qWp>8!d1 zIwEem$wggynz^l@A@%svgZ##G6 zELLF{`_H?0I>_VnVmThuizWSHOdJuftq860c>G6kfU(~jM?M>m|9nRt`Rsc)9=(@M zwow9b$53zlD8`B%4&S=N0(xj+l`a9fa*;C|Yz3t<&$r(99os8h_|*cT(AmjouIar9;QG1cn|YBBRr~caFxC4u;A96 zJvKqFa-VsD7*UjIvYMVxMIh!0)Ca@Nkx5$c5(Rnu8q015${Ln{!SnA z+AS;se;?rZa0Lt1eQJM?dcqlc)mj@qG`NgJs(GC=+HVnL9hU%M)`u4OiIE)x_==tE z)7S`r1P8O=)Cl#eNuU>w>?6J`MR)BOfnDmH)@$l5`XyT+lPfJJk<>DijzpeOF3#ET qDLIV)>O|jO{_C~cug=sjeDUvFKN_aRFvfX&xQszm{D@BGi2Wba Date: Wed, 20 May 2020 12:04:37 +0200 Subject: [PATCH 22/58] virtio-balloon: fix free page hinting without an iothread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case we don't have an iothread, we mark the feature as abscent but still add the queue. 'free_page_bh' remains set to NULL. qemu-system-i386 \ -M microvm \ -nographic \ -device virtio-balloon-device,free-page-hint=true \ -nographic \ -display none \ -monitor none \ -serial none \ -qtest stdio Doing a "write 0xc0000e30 0x24 0x030000000300000003000000030000000300000003000000030000000300000003000000" We will trigger a SEGFAULT. Let's move the check and bail out. While at it, move the static initializations to instance_init(). free_page_report_status and block_iothread are implicitly set to the right values (0/false) already, so drop the initialization. Reviewed-by: Alexander Duyck Reviewed-by: Philippe Mathieu-Daudé Fixes: c13c4153f76d ("virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT") Reported-by: Alexander Bulekov Cc: qemu-stable@nongnu.org Cc: Wei Wang Cc: Michael S. Tsirkin Cc: Philippe Mathieu-Daudé Cc: Alexander Duyck Signed-off-by: David Hildenbrand Message-Id: <20200520100439.19872-2-david@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-balloon.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 065cd450f1..7ff6a7aa7c 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -789,6 +789,13 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) return; } + if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT) && + !s->iothread) { + error_setg(errp, "'free-page-hint' requires 'iothread' to be set"); + virtio_cleanup(vdev); + return; + } + s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); @@ -797,24 +804,11 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, virtio_balloon_handle_free_page_vq); - s->free_page_report_status = FREE_PAGE_REPORT_S_STOP; - s->free_page_report_cmd_id = - VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN; - s->free_page_report_notify.notify = - virtio_balloon_free_page_report_notify; precopy_add_notifier(&s->free_page_report_notify); - if (s->iothread) { - object_ref(OBJECT(s->iothread)); - s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread), - virtio_ballloon_get_free_page_hints, s); - qemu_mutex_init(&s->free_page_lock); - qemu_cond_init(&s->free_page_cond); - s->block_iothread = false; - } else { - /* Simply disable this feature if the iothread wasn't created. */ - s->host_features &= ~(1 << VIRTIO_BALLOON_F_FREE_PAGE_HINT); - virtio_error(vdev, "iothread is missing"); - } + + object_ref(OBJECT(s->iothread)); + s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread), + virtio_ballloon_get_free_page_hints, s); } reset_stats(s); } @@ -892,6 +886,11 @@ static void virtio_balloon_instance_init(Object *obj) { VirtIOBalloon *s = VIRTIO_BALLOON(obj); + qemu_mutex_init(&s->free_page_lock); + qemu_cond_init(&s->free_page_cond); + s->free_page_report_cmd_id = VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN; + s->free_page_report_notify.notify = virtio_balloon_free_page_report_notify; + object_property_add(obj, "guest-stats", "guest statistics", balloon_stats_get_all, NULL, NULL, s); From 49b01711b8eb3796c6904c7f85d2431572cfe54f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 May 2020 12:04:38 +0200 Subject: [PATCH 23/58] virtio-balloon: fix free page hinting check on unrealize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Checking against guest features is wrong. We allocated data structures based on host features. We can rely on "free_page_bh" as an indicator whether to un-do stuff instead. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Alexander Duyck Fixes: c13c4153f76d ("virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT") Cc: qemu-stable@nongnu.org Cc: Wei Wang Cc: Michael S. Tsirkin Cc: Philippe Mathieu-Daudé Cc: Alexander Duyck Signed-off-by: David Hildenbrand Message-Id: <20200520100439.19872-3-david@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 7ff6a7aa7c..32e9fe3f64 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -818,7 +818,7 @@ static void virtio_balloon_device_unrealize(DeviceState *dev) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOBalloon *s = VIRTIO_BALLOON(dev); - if (virtio_balloon_free_page_support(s)) { + if (s->free_page_bh) { qemu_bh_delete(s->free_page_bh); virtio_balloon_free_page_stop(s); precopy_remove_notifier(&s->free_page_report_notify); From 105aef9c9479786d27c1c45c9b0b1fa03dc46be3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 May 2020 12:04:39 +0200 Subject: [PATCH 24/58] virtio-balloon: unref the iothread when unrealizing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We took a reference when realizing, so let's drop that reference when unrealizing. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Alexander Duyck Fixes: c13c4153f76d ("virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT") Cc: qemu-stable@nongnu.org Cc: Wei Wang Cc: Alexander Duyck Cc: Michael S. Tsirkin Cc: Philippe Mathieu-Daudé Signed-off-by: David Hildenbrand Message-Id: <20200520100439.19872-4-david@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-balloon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 32e9fe3f64..cff8eab6a1 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -820,6 +820,7 @@ static void virtio_balloon_device_unrealize(DeviceState *dev) if (s->free_page_bh) { qemu_bh_delete(s->free_page_bh); + object_unref(OBJECT(s->iothread)); virtio_balloon_free_page_stop(s); precopy_remove_notifier(&s->free_page_report_notify); } From 7483cbbaf8204de330b7362339a4a070b114b425 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 26 May 2020 21:14:00 -0700 Subject: [PATCH 25/58] virtio-balloon: Implement support for page poison reporting feature We need to make certain to advertise support for page poison reporting if we want to actually get data on if the guest will be poisoning pages. Add a value for reporting the poison value being used if page poisoning is enabled in the guest. With this we can determine if we will need to skip free page reporting when it is enabled in the future. The value currently has no impact on existing balloon interfaces. In the case of existing balloon interfaces the onus is on the guest driver to reapply whatever poison is in place. When we add free page reporting the poison value is used to determine if we can perform in-place page reporting. The expectation is that a reported page will already contain the value specified by the poison, and the reporting of the page should not change that value. Acked-by: David Hildenbrand Signed-off-by: Alexander Duyck Message-Id: <20200527041400.12700.33251.stgit@localhost.localdomain> --- hw/core/machine.c | 4 +++- hw/virtio/virtio-balloon.c | 29 +++++++++++++++++++++++++++++ include/hw/virtio/virtio-balloon.h | 1 + 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index bb3a7b18b1..9eca7d8c9b 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -28,7 +28,9 @@ #include "hw/mem/nvdimm.h" #include "migration/vmstate.h" -GlobalProperty hw_compat_5_0[] = {}; +GlobalProperty hw_compat_5_0[] = { + { "virtio-balloon-device", "page-poison", "false" }, +}; const size_t hw_compat_5_0_len = G_N_ELEMENTS(hw_compat_5_0); GlobalProperty hw_compat_4_2[] = { diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index cff8eab6a1..31d3c88482 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -634,6 +634,7 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) config.num_pages = cpu_to_le32(dev->num_pages); config.actual = cpu_to_le32(dev->actual); + config.poison_val = cpu_to_le32(dev->poison_val); if (dev->free_page_report_status == FREE_PAGE_REPORT_S_REQUESTED) { config.free_page_report_cmd_id = @@ -683,6 +684,14 @@ static ram_addr_t get_current_ram_size(void) return size; } +static bool virtio_balloon_page_poison_support(void *opaque) +{ + VirtIOBalloon *s = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON); +} + static void virtio_balloon_set_config(VirtIODevice *vdev, const uint8_t *config_data) { @@ -697,6 +706,10 @@ static void virtio_balloon_set_config(VirtIODevice *vdev, qapi_event_send_balloon_change(vm_ram_size - ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT)); } + dev->poison_val = 0; + if (virtio_balloon_page_poison_support(dev)) { + dev->poison_val = le32_to_cpu(config.poison_val); + } trace_virtio_balloon_set_config(dev->actual, oldactual); } @@ -755,6 +768,17 @@ static const VMStateDescription vmstate_virtio_balloon_free_page_report = { } }; +static const VMStateDescription vmstate_virtio_balloon_page_poison = { + .name = "vitio-balloon-device/page-poison", + .version_id = 1, + .minimum_version_id = 1, + .needed = virtio_balloon_page_poison_support, + .fields = (VMStateField[]) { + VMSTATE_UINT32(poison_val, VirtIOBalloon), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_virtio_balloon_device = { .name = "virtio-balloon-device", .version_id = 1, @@ -767,6 +791,7 @@ static const VMStateDescription vmstate_virtio_balloon_device = { }, .subsections = (const VMStateDescription * []) { &vmstate_virtio_balloon_free_page_report, + &vmstate_virtio_balloon_page_poison, NULL } }; @@ -849,6 +874,8 @@ static void virtio_balloon_device_reset(VirtIODevice *vdev) g_free(s->stats_vq_elem); s->stats_vq_elem = NULL; } + + s->poison_val = 0; } static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status) @@ -916,6 +943,8 @@ static Property virtio_balloon_properties[] = { VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false), DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT, false), + DEFINE_PROP_BIT("page-poison", VirtIOBalloon, host_features, + VIRTIO_BALLOON_F_PAGE_POISON, true), /* QEMU 4.0 accidentally changed the config size even when free-page-hint * is disabled, resulting in QEMU 3.1 migration incompatibility. This * property retains this quirk for QEMU 4.1 machine types. diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h index d1c968d237..7fe78e5c14 100644 --- a/include/hw/virtio/virtio-balloon.h +++ b/include/hw/virtio/virtio-balloon.h @@ -70,6 +70,7 @@ typedef struct VirtIOBalloon { uint32_t host_features; bool qemu_4_0_config_size; + uint32_t poison_val; } VirtIOBalloon; #endif From 91b867191ddf73c64f51ca2ddb489b922ed4058e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 26 May 2020 21:14:07 -0700 Subject: [PATCH 26/58] virtio-balloon: Provide an interface for free page reporting Add support for free page reporting. The idea is to function very similar to how the balloon works in that we basically end up madvising the page as not being used. However we don't really need to bother with any deflate type logic since the page will be faulted back into the guest when it is read or written to. This provides a new way of letting the guest proactively report free pages to the hypervisor, so the hypervisor can reuse them. In contrast to inflate/deflate that is triggered via the hypervisor explicitly. Acked-by: David Hildenbrand Signed-off-by: Alexander Duyck Message-Id: <20200527041407.12700.73735.stgit@localhost.localdomain> --- hw/virtio/virtio-balloon.c | 72 ++++++++++++++++++++++++++++++ include/hw/virtio/virtio-balloon.h | 2 +- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 31d3c88482..10507b2a43 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -321,6 +321,67 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, balloon_stats_change_timer(s, 0); } +static void virtio_balloon_handle_report(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); + VirtQueueElement *elem; + + while ((elem = virtqueue_pop(vq, sizeof(VirtQueueElement)))) { + unsigned int i; + + /* + * When we discard the page it has the effect of removing the page + * from the hypervisor itself and causing it to be zeroed when it + * is returned to us. So we must not discard the page if it is + * accessible by another device or process, or if the guest is + * expecting it to retain a non-zero value. + */ + if (qemu_balloon_is_inhibited() || dev->poison_val) { + goto skip_element; + } + + for (i = 0; i < elem->in_num; i++) { + void *addr = elem->in_sg[i].iov_base; + size_t size = elem->in_sg[i].iov_len; + ram_addr_t ram_offset; + RAMBlock *rb; + + /* + * There is no need to check the memory section to see if + * it is ram/readonly/romd like there is for handle_output + * below. If the region is not meant to be written to then + * address_space_map will have allocated a bounce buffer + * and it will be freed in address_space_unmap and trigger + * and unassigned_mem_write before failing to copy over the + * buffer. If more than one bad descriptor is provided it + * will return NULL after the first bounce buffer and fail + * to map any resources. + */ + rb = qemu_ram_block_from_host(addr, false, &ram_offset); + if (!rb) { + trace_virtio_balloon_bad_addr(elem->in_addr[i]); + continue; + } + + /* + * For now we will simply ignore unaligned memory regions, or + * regions that overrun the end of the RAMBlock. + */ + if (!QEMU_IS_ALIGNED(ram_offset | size, qemu_ram_pagesize(rb)) || + (ram_offset + size) > qemu_ram_get_used_length(rb)) { + continue; + } + + ram_block_discard_range(rb, ram_offset, size); + } + +skip_element: + virtqueue_push(vq, elem, 0); + virtio_notify(vdev, vq); + g_free(elem); + } +} + static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) { VirtIOBalloon *s = VIRTIO_BALLOON(vdev); @@ -835,6 +896,12 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread), virtio_ballloon_get_free_page_hints, s); } + + if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_REPORTING)) { + s->reporting_vq = virtio_add_queue(vdev, 32, + virtio_balloon_handle_report); + } + reset_stats(s); } @@ -858,6 +925,9 @@ static void virtio_balloon_device_unrealize(DeviceState *dev) if (s->free_page_vq) { virtio_delete_queue(s->free_page_vq); } + if (s->reporting_vq) { + virtio_delete_queue(s->reporting_vq); + } virtio_cleanup(vdev); } @@ -945,6 +1015,8 @@ static Property virtio_balloon_properties[] = { VIRTIO_BALLOON_F_FREE_PAGE_HINT, false), DEFINE_PROP_BIT("page-poison", VirtIOBalloon, host_features, VIRTIO_BALLOON_F_PAGE_POISON, true), + DEFINE_PROP_BIT("free-page-reporting", VirtIOBalloon, host_features, + VIRTIO_BALLOON_F_REPORTING, false), /* QEMU 4.0 accidentally changed the config size even when free-page-hint * is disabled, resulting in QEMU 3.1 migration incompatibility. This * property retains this quirk for QEMU 4.1 machine types. diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h index 7fe78e5c14..d49fef00ce 100644 --- a/include/hw/virtio/virtio-balloon.h +++ b/include/hw/virtio/virtio-balloon.h @@ -42,7 +42,7 @@ enum virtio_balloon_free_page_report_status { typedef struct VirtIOBalloon { VirtIODevice parent_obj; - VirtQueue *ivq, *dvq, *svq, *free_page_vq; + VirtQueue *ivq, *dvq, *svq, *free_page_vq, *reporting_vq; uint32_t free_page_report_status; uint32_t num_pages; uint32_t actual; From b963ea19f896e34e21275ff69fa4565948f703c8 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Sun, 7 Jun 2020 07:20:22 +0200 Subject: [PATCH 27/58] MAINTAINERS: Fix the classification of bios-tables-test-allowed-diff.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The file tests/qtest/bios-tables-test-allowed-diff.h is currently only assigned to the qtest section according MAINTAINERS. However, this file normally only gets updated when the ACPI tables changed - something the qtest maintainers don't have much clue of. Thus this file should rather be assigned to the ACPI maintainers instead. Signed-off-by: Thomas Huth Message-Id: <20200607052022.12222-1-thuth@redhat.com> Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6e7890ce82..0c5ed09ad5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1531,7 +1531,7 @@ F: hw/acpi/* F: hw/smbios/* F: hw/i386/acpi-build.[hc] F: hw/arm/virt-acpi-build.c -F: tests/qtest/bios-tables-test.c +F: tests/qtest/bios-tables-test* F: tests/qtest/acpi-utils.[hc] F: tests/data/acpi/ @@ -2321,6 +2321,7 @@ S: Maintained F: qtest.c F: accel/qtest.c F: tests/qtest/ +X: tests/qtest/bios-tables-test-allowed-diff.h Device Fuzzing M: Alexander Bulekov From 0dabc0f6544f2c0310546f6d6cf3b68979580a9c Mon Sep 17 00:00:00 2001 From: Julia Suvorova Date: Thu, 4 Jun 2020 14:59:46 +0200 Subject: [PATCH 28/58] hw/pci/pcie: Move hot plug capability check to pre_plug callback Check for hot plug capability earlier to avoid removing devices attached during the initialization process. Run qemu with an unattached drive: -drive file=$FILE,if=none,id=drive0 \ -device pcie-root-port,id=rp0,slot=3,bus=pcie.0,hotplug=off Hotplug a block device: device_add virtio-blk-pci,id=blk0,drive=drive0,bus=rp0 If hotplug fails on plug_cb, drive0 will be deleted. Fixes: 0501e1aa1d32a6 ("hw/pci/pcie: Forbid hot-plug if it's disabled on the slot") Acked-by: Igor Mammedov Signed-off-by: Julia Suvorova Message-Id: <20200604125947.881210-1-jusual@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pcie.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index f50e10b8fb..5b9c022d91 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -407,6 +407,17 @@ static void pcie_cap_slot_plug_common(PCIDevice *hotplug_dev, DeviceState *dev, void pcie_cap_slot_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { + PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev); + uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; + uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); + + /* Check if hot-plug is disabled on the slot */ + if (dev->hotplugged && (sltcap & PCI_EXP_SLTCAP_HPC) == 0) { + error_setg(errp, "Hot-plug failed: unsupported by the port device '%s'", + DEVICE(hotplug_pdev)->id); + return; + } + pcie_cap_slot_plug_common(PCI_DEVICE(hotplug_dev), dev, errp); } @@ -415,7 +426,6 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, { PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev); uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; - uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); PCIDevice *pci_dev = PCI_DEVICE(dev); /* Don't send event when device is enabled during qemu machine creation: @@ -431,13 +441,6 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, return; } - /* Check if hot-plug is disabled on the slot */ - if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) { - error_setg(errp, "Hot-plug failed: unsupported by the port device '%s'", - DEVICE(hotplug_pdev)->id); - return; - } - /* To enable multifunction hot-plug, we just ensure the function * 0 added last. When function 0 is added, we set the sltsta and * inform OS via event notification. From f7d6a635fa3b7797f9d072e280f065bf3cfcd24d Mon Sep 17 00:00:00 2001 From: Prasad J Pandit Date: Thu, 4 Jun 2020 17:05:25 +0530 Subject: [PATCH 29/58] pci: assert configuration access is within bounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While accessing PCI configuration bytes, assert that 'address + len' is within PCI configuration space. Generally it is within bounds. This is more of a defensive assert, in case a buggy device was to send 'address' which may go out of bounds. Suggested-by: Philippe Mathieu-Daudé Signed-off-by: Prasad J Pandit Message-Id: <20200604113525.58898-1-ppandit@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 70c66965f5..7bf2ae6d92 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1381,6 +1381,8 @@ uint32_t pci_default_read_config(PCIDevice *d, { uint32_t val = 0; + assert(address + len <= pci_config_size(d)); + if (pci_is_express_downstream_port(d) && ranges_overlap(address, len, d->exp.exp_cap + PCI_EXP_LNKSTA, 2)) { pcie_sync_bridge_lnk(d); @@ -1394,6 +1396,8 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int int i, was_irq_disabled = pci_irq_disabled(d); uint32_t val = val_in; + assert(addr + l <= pci_config_size(d)); + for (i = 0; i < l; val >>= 8, ++i) { uint8_t wmask = d->wmask[addr + i]; uint8_t w1cmask = d->w1cmask[addr + i]; From ea2fe4dfe4e5d32e69ae749a3b9f287aaf8b898e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 1 Jun 2020 16:29:24 +0200 Subject: [PATCH 30/58] hw/pci-host/prep: Correct RAVEN bus bridge memory region size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit memory_region_set_size() handle the 16 Exabytes limit by special-casing the UINT64_MAX value. This is not a problem for the 32-bit maximum, 4 GiB. By using the UINT32_MAX value, the bm-raven MemoryRegion ends up missing 1 byte: $ qemu-system-ppc -M prep -S -monitor stdio -usb memory-region: bm-raven 0000000000000000-00000000fffffffe (prio 0, i/o): bm-raven 0000000000000000-000000003effffff (prio 0, i/o): alias bm-pci-memory @pci-memory 0000000000000000-000000003effffff 0000000080000000-00000000ffffffff (prio 0, i/o): alias bm-system @system 0000000000000000-000000007fffffff Fix by using the correct value. We now have: memory-region: bm-raven 0000000000000000-00000000ffffffff (prio 0, i/o): bm-raven 0000000000000000-000000003effffff (prio 0, i/o): alias bm-pci-memory @pci-memory 0000000000000000-000000003effffff 0000000080000000-00000000ffffffff (prio 0, i/o): alias bm-system @system 0000000000000000-000000007fffffff Reviewed-by: Peter Maydell Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200601142930.29408-3-f4bug@amsat.org> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Richard Henderson --- hw/pci-host/prep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/pci-host/prep.c b/hw/pci-host/prep.c index 1a02e9a670..88e2fc66a9 100644 --- a/hw/pci-host/prep.c +++ b/hw/pci-host/prep.c @@ -294,7 +294,7 @@ static void raven_pcihost_initfn(Object *obj) &s->pci_memory, &s->pci_io, 0, TYPE_PCI_BUS); /* Bus master address space */ - memory_region_init(&s->bm, obj, "bm-raven", UINT32_MAX); + memory_region_init(&s->bm, obj, "bm-raven", 4 * GiB); memory_region_init_alias(&s->bm_pci_memory_alias, obj, "bm-pci-memory", &s->pci_memory, 0, memory_region_size(&s->pci_memory)); From 2dc48da25547bf82dee81f4e60766509b28e736e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 1 Jun 2020 16:29:25 +0200 Subject: [PATCH 31/58] hw/pci/pci_bridge: Correct pci_bridge_io memory region size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit memory_region_set_size() handle the 16 Exabytes limit by special-casing the UINT64_MAX value. This is not a problem for the 32-bit maximum, 4 GiB. By using the UINT32_MAX value, the pci_bridge_io MemoryRegion ends up missing 1 byte: (qemu) info mtree memory-region: pci_bridge_io 0000000000000000-00000000fffffffe (prio 0, i/o): pci_bridge_io 0000000000000060-0000000000000060 (prio 0, i/o): i8042-data 0000000000000064-0000000000000064 (prio 0, i/o): i8042-cmd 00000000000001ce-00000000000001d1 (prio 0, i/o): vbe 0000000000000378-000000000000037f (prio 0, i/o): parallel 00000000000003b4-00000000000003b5 (prio 0, i/o): vga ... Fix by using the correct value. We now have: memory-region: pci_bridge_io 0000000000000000-00000000ffffffff (prio 0, i/o): pci_bridge_io 0000000000000060-0000000000000060 (prio 0, i/o): i8042-data 0000000000000064-0000000000000064 (prio 0, i/o): i8042-cmd ... Reviewed-by: Peter Maydell Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200601142930.29408-4-f4bug@amsat.org> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Richard Henderson --- hw/pci/pci_bridge.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c index 97967d12eb..3ba3203f72 100644 --- a/hw/pci/pci_bridge.c +++ b/hw/pci/pci_bridge.c @@ -30,6 +30,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "hw/pci/pci_bridge.h" #include "hw/pci/pci_bus.h" #include "qemu/module.h" @@ -381,7 +382,7 @@ void pci_bridge_initfn(PCIDevice *dev, const char *typename) memory_region_init(&br->address_space_mem, OBJECT(br), "pci_bridge_pci", UINT64_MAX); sec_bus->address_space_io = &br->address_space_io; memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io", - UINT32_MAX); + 4 * GiB); br->windows = pci_bridge_region_init(br); QLIST_INIT(&sec_bus->child); QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling); From 37e7211cae59d090eafdf26fb2b21ca41df7aed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 1 Jun 2020 16:29:26 +0200 Subject: [PATCH 32/58] hw/pci/pci_bridge: Use the IEC binary prefix definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IEC binary prefixes ease code review: the unit is explicit. Reviewed-by: Peter Maydell Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200601142930.29408-5-f4bug@amsat.org> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Richard Henderson --- hw/pci/pci_bridge.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c index 3ba3203f72..3789c17edc 100644 --- a/hw/pci/pci_bridge.c +++ b/hw/pci/pci_bridge.c @@ -423,14 +423,14 @@ int pci_bridge_qemu_reserve_cap_init(PCIDevice *dev, int cap_offset, } if (res_reserve.mem_non_pref != (uint64_t)-1 && - res_reserve.mem_non_pref >= (1ULL << 32)) { + res_reserve.mem_non_pref >= 4 * GiB) { error_setg(errp, "PCI resource reserve cap: mem-reserve must be less than 4G"); return -EINVAL; } if (res_reserve.mem_pref_32 != (uint64_t)-1 && - res_reserve.mem_pref_32 >= (1ULL << 32)) { + res_reserve.mem_pref_32 >= 4 * GiB) { error_setg(errp, "PCI resource reserve cap: pref32-reserve must be less than 4G"); return -EINVAL; From 51eae1e7e427ef9efc7c10c52b33575fe685add3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 1 Jun 2020 16:29:27 +0200 Subject: [PATCH 33/58] hw/pci-host: Use the IEC binary prefix definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IEC binary prefixes ease code review: the unit is explicit. Reviewed-by: Peter Maydell Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200601142930.29408-6-f4bug@amsat.org> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Richard Henderson --- hw/pci-host/i440fx.c | 3 ++- hw/pci-host/q35.c | 2 +- hw/pci-host/versatile.c | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c index 0adbd77553..aefb416c8f 100644 --- a/hw/pci-host/i440fx.c +++ b/hw/pci-host/i440fx.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "qemu/range.h" #include "hw/i386/pc.h" #include "hw/pci/pci.h" @@ -301,7 +302,7 @@ PCIBus *i440fx_init(const char *host_type, const char *pci_type, memory_region_set_enabled(&f->smram_region, true); /* smram, as seen by SMM CPUs */ - memory_region_init(&f->smram, OBJECT(d), "smram", 1ull << 32); + memory_region_init(&f->smram, OBJECT(d), "smram", 4 * GiB); memory_region_set_enabled(&f->smram, true); memory_region_init_alias(&f->low_smram, OBJECT(d), "smram-low", f->ram_memory, 0xa0000, 0x20000); diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index 352aeecfa7..b788f17b2c 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -589,7 +589,7 @@ static void mch_realize(PCIDevice *d, Error **errp) memory_region_set_enabled(&mch->open_high_smram, false); /* smram, as seen by SMM CPUs */ - memory_region_init(&mch->smram, OBJECT(mch), "smram", 1ull << 32); + memory_region_init(&mch->smram, OBJECT(mch), "smram", 4 * GiB); memory_region_set_enabled(&mch->smram, true); memory_region_init_alias(&mch->low_smram, OBJECT(mch), "smram-low", mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, diff --git a/hw/pci-host/versatile.c b/hw/pci-host/versatile.c index cfb9a78ea6..8ddfb8772a 100644 --- a/hw/pci-host/versatile.c +++ b/hw/pci-host/versatile.c @@ -8,6 +8,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "hw/sysbus.h" #include "migration/vmstate.h" #include "hw/irq.h" @@ -399,8 +400,8 @@ static void pci_vpb_realize(DeviceState *dev, Error **errp) pci_map_irq_fn mapfn; int i; - memory_region_init(&s->pci_io_space, OBJECT(s), "pci_io", 1ULL << 32); - memory_region_init(&s->pci_mem_space, OBJECT(s), "pci_mem", 1ULL << 32); + memory_region_init(&s->pci_io_space, OBJECT(s), "pci_io", 4 * GiB); + memory_region_init(&s->pci_mem_space, OBJECT(s), "pci_mem", 4 * GiB); pci_root_bus_new_inplace(&s->pci_bus, sizeof(s->pci_bus), dev, "pci", &s->pci_mem_space, &s->pci_io_space, From 271094474b65de1ad7aaf729938de3d9b9d0d36f Mon Sep 17 00:00:00 2001 From: Dima Stepanov Date: Thu, 28 May 2020 12:11:18 +0300 Subject: [PATCH 34/58] char-socket: return -1 in case of disconnect during tcp_chr_write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During testing of the vhost-user-blk reconnect functionality the qemu SIGSEGV was triggered: start qemu as: x86_64-softmmu/qemu-system-x86_64 -m 1024M -M q35 \ -object memory-backend-file,id=ram-node0,size=1024M,mem-path=/dev/shm/qemu,share=on \ -numa node,cpus=0,memdev=ram-node0 \ -chardev socket,id=chardev0,path=./vhost.sock,noserver,reconnect=1 \ -device vhost-user-blk-pci,chardev=chardev0,num-queues=4 --enable-kvm start vhost-user-blk daemon: ./vhost-user-blk -s ./vhost.sock -b test-img.raw If vhost-user-blk will be killed during the vhost initialization process, for instance after getting VHOST_SET_VRING_CALL command, then QEMU will fail with the following backtrace: Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault. 0x00005555559272bb in vhost_user_read (dev=0x7fffef2d53e0, msg=0x7fffffffd5b0) at ./hw/virtio/vhost-user.c:260 260 CharBackend *chr = u->user->chr; #0 0x00005555559272bb in vhost_user_read (dev=0x7fffef2d53e0, msg=0x7fffffffd5b0) at ./hw/virtio/vhost-user.c:260 #1 0x000055555592acb8 in vhost_user_get_config (dev=0x7fffef2d53e0, config=0x7fffef2d5394 "", config_len=60) at ./hw/virtio/vhost-user.c:1645 #2 0x0000555555925525 in vhost_dev_get_config (hdev=0x7fffef2d53e0, config=0x7fffef2d5394 "", config_len=60) at ./hw/virtio/vhost.c:1490 #3 0x00005555558cc46b in vhost_user_blk_device_realize (dev=0x7fffef2d51a0, errp=0x7fffffffd8f0) at ./hw/block/vhost-user-blk.c:429 #4 0x0000555555920090 in virtio_device_realize (dev=0x7fffef2d51a0, errp=0x7fffffffd948) at ./hw/virtio/virtio.c:3615 #5 0x0000555555a9779c in device_set_realized (obj=0x7fffef2d51a0, value=true, errp=0x7fffffffdb88) at ./hw/core/qdev.c:891 ... The problem is that vhost_user_write doesn't get an error after disconnect and try to call vhost_user_read(). The tcp_chr_write() routine should return -1 in case of disconnect. Indicate the EIO error if this routine is called in the disconnected state. Signed-off-by: Dima Stepanov Reviewed-by: Marc-André Lureau Message-Id: Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- chardev/char-socket.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index db253d4024..18e762643b 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -175,15 +175,16 @@ static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len) if (ret < 0 && errno != EAGAIN) { if (tcp_chr_read_poll(chr) <= 0) { + /* Perform disconnect and return error. */ tcp_chr_disconnect_locked(chr); - return len; } /* else let the read handler finish it properly */ } return ret; } else { - /* XXX: indicate an error ? */ - return len; + /* Indicate an error. */ + errno = EIO; + return -1; } } From 4bcad76f4c390f798198a9a93fbaa1a20d06b73e Mon Sep 17 00:00:00 2001 From: Dima Stepanov Date: Thu, 28 May 2020 12:11:19 +0300 Subject: [PATCH 35/58] vhost-user-blk: delay vhost_user_blk_disconnect A socket write during vhost-user communication may trigger a disconnect event, calling vhost_user_blk_disconnect() and clearing all the vhost_dev structures holding data that vhost-user functions expect to remain valid to roll back initialization correctly. Delay the cleanup to keep vhost_dev structure valid. There are two possible states to handle: 1. RUN_STATE_PRELAUNCH: skip bh oneshot call and perform disconnect in the caller routine. 2. RUN_STATE_RUNNING: delay by using bh BH changes are based on the similar changes for the vhost-user-net device: commit e7c83a885f865128ae3cf1946f8cb538b63cbfba "vhost-user: delay vhost_user_stop" Signed-off-by: Dima Stepanov Message-Id: <69b73b94dcd066065595266c852810e0863a0895.1590396396.git.dimastep@yandex-team.ru> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Signed-off-by: Li Feng Reviewed-by: Raphael Norwitz --- hw/block/vhost-user-blk.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 9d8c0b3909..76838e76d3 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -349,6 +349,19 @@ static void vhost_user_blk_disconnect(DeviceState *dev) vhost_dev_cleanup(&s->dev); } +static void vhost_user_blk_event(void *opaque, QEMUChrEvent event); + +static void vhost_user_blk_chr_closed_bh(void *opaque) +{ + DeviceState *dev = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserBlk *s = VHOST_USER_BLK(vdev); + + vhost_user_blk_disconnect(dev); + qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, vhost_user_blk_event, + NULL, opaque, NULL, true); +} + static void vhost_user_blk_event(void *opaque, QEMUChrEvent event) { DeviceState *dev = opaque; @@ -363,7 +376,30 @@ static void vhost_user_blk_event(void *opaque, QEMUChrEvent event) } break; case CHR_EVENT_CLOSED: - vhost_user_blk_disconnect(dev); + /* + * A close event may happen during a read/write, but vhost + * code assumes the vhost_dev remains setup, so delay the + * stop & clear. There are two possible paths to hit this + * disconnect event: + * 1. When VM is in the RUN_STATE_PRELAUNCH state. The + * vhost_user_blk_device_realize() is a caller. + * 2. In tha main loop phase after VM start. + * + * For p2 the disconnect event will be delayed. We can't + * do the same for p1, because we are not running the loop + * at this moment. So just skip this step and perform + * disconnect in the caller function. + * + * TODO: maybe it is a good idea to make the same fix + * for other vhost-user devices. + */ + if (runstate_is_running()) { + AioContext *ctx = qemu_get_current_aio_context(); + + qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, NULL, NULL, + NULL, NULL, false); + aio_bh_schedule_oneshot(ctx, vhost_user_blk_chr_closed_bh, opaque); + } break; case CHR_EVENT_BREAK: case CHR_EVENT_MUX_IN: From ece99091c2d0aeb23734289a50ef2ff4e0a08929 Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 21 May 2020 05:00:26 +0000 Subject: [PATCH 36/58] Add helper to populate vhost-user message regions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When setting vhost-user memory tables, memory region descriptors must be copied from the vhost_dev struct to the vhost-user message. To avoid duplicating code in setting the memory tables, we should use a helper to populate this field. This change adds this helper. Signed-off-by: Raphael Norwitz Message-Id: <1588533678-23450-2-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Marc-André Lureau --- hw/virtio/vhost-user.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index ec21e8fbe8..2e0552dd74 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -407,6 +407,15 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, return 0; } +static void vhost_user_fill_msg_region(VhostUserMemoryRegion *dst, + struct vhost_memory_region *src) +{ + assert(src != NULL && dst != NULL); + dst->userspace_addr = src->userspace_addr; + dst->memory_size = src->memory_size; + dst->guest_phys_addr = src->guest_phys_addr; +} + static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u, struct vhost_dev *dev, VhostUserMsg *msg, @@ -417,6 +426,7 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u, ram_addr_t offset; MemoryRegion *mr; struct vhost_memory_region *reg; + VhostUserMemoryRegion region_buffer; msg->hdr.request = VHOST_USER_SET_MEM_TABLE; @@ -441,12 +451,8 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u, error_report("Failed preparing vhost-user memory table msg"); return -1; } - msg->payload.memory.regions[*fd_num].userspace_addr = - reg->userspace_addr; - msg->payload.memory.regions[*fd_num].memory_size = - reg->memory_size; - msg->payload.memory.regions[*fd_num].guest_phys_addr = - reg->guest_phys_addr; + vhost_user_fill_msg_region(®ion_buffer, reg); + msg->payload.memory.regions[*fd_num] = region_buffer; msg->payload.memory.regions[*fd_num].mmap_offset = offset; fds[(*fd_num)++] = fd; } else if (track_ramblocks) { From 23374a84c5f08e20ec2506a6322330d51f9134c5 Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 21 May 2020 05:00:29 +0000 Subject: [PATCH 37/58] Add vhost-user helper to get MemoryRegion data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When setting the memory tables, qemu uses a memory region's userspace address to look up the region's MemoryRegion struct. Among other things, the MemoryRegion contains the region's offset and associated file descriptor, all of which need to be sent to the backend. With VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS, this logic will be needed in multiple places, so before feature support is added it should be moved to a helper function. This helper is also used to simplify the vhost_user_can_merge() function. Signed-off-by: Raphael Norwitz Message-Id: <1588533678-23450-3-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Marc-André Lureau --- hw/virtio/vhost-user.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 2e0552dd74..442b0d650a 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -407,6 +407,18 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, return 0; } +static MemoryRegion *vhost_user_get_mr_data(uint64_t addr, ram_addr_t *offset, + int *fd) +{ + MemoryRegion *mr; + + assert((uintptr_t)addr == addr); + mr = memory_region_from_host((void *)(uintptr_t)addr, offset); + *fd = memory_region_get_fd(mr); + + return mr; +} + static void vhost_user_fill_msg_region(VhostUserMemoryRegion *dst, struct vhost_memory_region *src) { @@ -433,10 +445,7 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u, for (i = 0; i < dev->mem->nregions; ++i) { reg = dev->mem->regions + i; - assert((uintptr_t)reg->userspace_addr == reg->userspace_addr); - mr = memory_region_from_host((void *)(uintptr_t)reg->userspace_addr, - &offset); - fd = memory_region_get_fd(mr); + mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd); if (fd > 0) { if (track_ramblocks) { assert(*fd_num < VHOST_MEMORY_MAX_NREGIONS); @@ -1551,13 +1560,9 @@ static bool vhost_user_can_merge(struct vhost_dev *dev, { ram_addr_t offset; int mfd, rfd; - MemoryRegion *mr; - mr = memory_region_from_host((void *)(uintptr_t)start1, &offset); - mfd = memory_region_get_fd(mr); - - mr = memory_region_from_host((void *)(uintptr_t)start2, &offset); - rfd = memory_region_get_fd(mr); + (void)vhost_user_get_mr_data(start1, &offset, &mfd); + (void)vhost_user_get_mr_data(start2, &offset, &rfd); return mfd == rfd; } From 6b0eff1a4ea47c835a7d8bee88c05c47ada37495 Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 21 May 2020 05:00:32 +0000 Subject: [PATCH 38/58] Add VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change introduces a new feature to the vhost-user protocol allowing a backend device to specify the maximum number of ram slots it supports. At this point, the value returned by the backend will be capped at the maximum number of ram slots which can be supported by vhost-user, which is currently set to 8 because of underlying protocol limitations. The returned value will be stored inside the VhostUserState struct so that on device reconnect we can verify that the ram slot limitation has not decreased since the last time the device connected. Signed-off-by: Raphael Norwitz Signed-off-by: Peter Turschmid Message-Id: <1588533678-23450-4-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Marc-André Lureau --- docs/interop/vhost-user.rst | 16 +++++++++++ hw/virtio/vhost-user.c | 49 ++++++++++++++++++++++++++++++++-- include/hw/virtio/vhost-user.h | 1 + 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index 3b1b6602c7..b3cf5c3cb5 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -815,6 +815,7 @@ Protocol features #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 #define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13 #define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14 + #define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 15 Master message types -------------------- @@ -1263,6 +1264,21 @@ Master message types The state.num field is currently reserved and must be set to 0. +``VHOST_USER_GET_MAX_MEM_SLOTS`` + :id: 36 + :equivalent ioctl: N/A + :slave payload: u64 + + When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol + feature has been successfully negotiated, this message is submitted + by master to the slave. The slave should return the message with a + u64 payload containing the maximum number of memory slots for + QEMU to expose to the guest. At this point, the value returned + by the backend will be capped at the maximum number of ram slots + which can be supported by vhost-user. Currently that limit is set + at VHOST_USER_MAX_RAM_SLOTS = 8 because of underlying protocol + limitations. + Slave message types ------------------- diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 442b0d650a..754ad885cf 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -59,6 +59,8 @@ enum VhostUserProtocolFeature { VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13, + /* Feature 14 reserved for VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. */ + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, VHOST_USER_PROTOCOL_F_MAX }; @@ -100,6 +102,8 @@ typedef enum VhostUserRequest { VHOST_USER_SET_INFLIGHT_FD = 32, VHOST_USER_GPU_SET_SOCKET = 33, VHOST_USER_RESET_DEVICE = 34, + /* Message number 35 reserved for VHOST_USER_VRING_KICK. */ + VHOST_USER_GET_MAX_MEM_SLOTS = 36, VHOST_USER_MAX } VhostUserRequest; @@ -895,6 +899,23 @@ static int vhost_user_set_owner(struct vhost_dev *dev) return 0; } +static int vhost_user_get_max_memslots(struct vhost_dev *dev, + uint64_t *max_memslots) +{ + uint64_t backend_max_memslots; + int err; + + err = vhost_user_get_u64(dev, VHOST_USER_GET_MAX_MEM_SLOTS, + &backend_max_memslots); + if (err < 0) { + return err; + } + + *max_memslots = backend_max_memslots; + + return 0; +} + static int vhost_user_reset_device(struct vhost_dev *dev) { VhostUserMsg msg = { @@ -1392,7 +1413,7 @@ static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier, static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque) { - uint64_t features, protocol_features; + uint64_t features, protocol_features, ram_slots; struct vhost_user *u; int err; @@ -1454,6 +1475,27 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque) "slave-req protocol features."); return -1; } + + /* get max memory regions if backend supports configurable RAM slots */ + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS)) { + u->user->memory_slots = VHOST_MEMORY_MAX_NREGIONS; + } else { + err = vhost_user_get_max_memslots(dev, &ram_slots); + if (err < 0) { + return err; + } + + if (ram_slots < u->user->memory_slots) { + error_report("The backend specified a max ram slots limit " + "of %" PRIu64", when the prior validated limit was %d. " + "This limit should never decrease.", ram_slots, + u->user->memory_slots); + return -1; + } + + u->user->memory_slots = MIN(ram_slots, VHOST_MEMORY_MAX_NREGIONS); + } } if (dev->migration_blocker == NULL && @@ -1519,7 +1561,9 @@ static int vhost_user_get_vq_index(struct vhost_dev *dev, int idx) static int vhost_user_memslots_limit(struct vhost_dev *dev) { - return VHOST_MEMORY_MAX_NREGIONS; + struct vhost_user *u = dev->opaque; + + return u->user->memory_slots; } static bool vhost_user_requires_shm_log(struct vhost_dev *dev) @@ -1904,6 +1948,7 @@ bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp) return false; } user->chr = chr; + user->memory_slots = 0; return true; } diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h index 811e325f42..a9abca3288 100644 --- a/include/hw/virtio/vhost-user.h +++ b/include/hw/virtio/vhost-user.h @@ -20,6 +20,7 @@ typedef struct VhostUserHostNotifier { typedef struct VhostUserState { CharBackend *chr; VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX]; + int memory_slots; } VhostUserState; bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp); From f1aeb14b0809e313c74244d838645ed25e85ea63 Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 21 May 2020 05:00:35 +0000 Subject: [PATCH 39/58] Transmit vhost-user memory regions individually MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz Signed-off-by: Swapnil Ingle Signed-off-by: Peter Turschmid Suggested-by: Mike Cui Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Marc-André Lureau --- docs/interop/vhost-user.rst | 33 ++- hw/virtio/vhost-user.c | 516 ++++++++++++++++++++++++++++++------ 2 files changed, 472 insertions(+), 77 deletions(-) diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index b3cf5c3cb5..037eefab0e 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -1276,8 +1276,37 @@ Master message types QEMU to expose to the guest. At this point, the value returned by the backend will be capped at the maximum number of ram slots which can be supported by vhost-user. Currently that limit is set - at VHOST_USER_MAX_RAM_SLOTS = 8 because of underlying protocol - limitations. + at VHOST_USER_MAX_RAM_SLOTS = 8. + +``VHOST_USER_ADD_MEM_REG`` + :id: 37 + :equivalent ioctl: N/A + :slave payload: memory region + + When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol + feature has been successfully negotiated, this message is submitted + by the master to the slave. The message payload contains a memory + region descriptor struct, describing a region of guest memory which + the slave device must map in. When the + ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol feature has + been successfully negotiated, along with the + ``VHOST_USER_REM_MEM_REG`` message, this message is used to set and + update the memory tables of the slave device. + +``VHOST_USER_REM_MEM_REG`` + :id: 38 + :equivalent ioctl: N/A + :slave payload: memory region + + When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol + feature has been successfully negotiated, this message is submitted + by the master to the slave. The message payload contains a memory + region descriptor struct, describing a region of guest memory which + the slave device must unmap. When the + ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol feature has + been successfully negotiated, along with the + ``VHOST_USER_ADD_MEM_REG`` message, this message is used to set and + update the memory tables of the slave device. Slave message types ------------------- diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 754ad885cf..3640f017a2 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -104,6 +104,8 @@ typedef enum VhostUserRequest { VHOST_USER_RESET_DEVICE = 34, /* Message number 35 reserved for VHOST_USER_VRING_KICK. */ VHOST_USER_GET_MAX_MEM_SLOTS = 36, + VHOST_USER_ADD_MEM_REG = 37, + VHOST_USER_REM_MEM_REG = 38, VHOST_USER_MAX } VhostUserRequest; @@ -128,6 +130,11 @@ typedef struct VhostUserMemory { VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; } VhostUserMemory; +typedef struct VhostUserMemRegMsg { + uint32_t padding; + VhostUserMemoryRegion region; +} VhostUserMemRegMsg; + typedef struct VhostUserLog { uint64_t mmap_size; uint64_t mmap_offset; @@ -186,6 +193,7 @@ typedef union { struct vhost_vring_state state; struct vhost_vring_addr addr; VhostUserMemory memory; + VhostUserMemRegMsg mem_reg; VhostUserLog log; struct vhost_iotlb_msg iotlb; VhostUserConfig config; @@ -226,6 +234,16 @@ struct vhost_user { /* True once we've entered postcopy_listen */ bool postcopy_listen; + + /* Our current regions */ + int num_shadow_regions; + struct vhost_memory_region shadow_regions[VHOST_MEMORY_MAX_NREGIONS]; +}; + +struct scrub_regions { + struct vhost_memory_region *region; + int reg_idx; + int fd_idx; }; static bool ioeventfd_enabled(void) @@ -489,8 +507,332 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u, return 1; } +static inline bool reg_equal(struct vhost_memory_region *shadow_reg, + struct vhost_memory_region *vdev_reg) +{ + return shadow_reg->guest_phys_addr == vdev_reg->guest_phys_addr && + shadow_reg->userspace_addr == vdev_reg->userspace_addr && + shadow_reg->memory_size == vdev_reg->memory_size; +} + +static void scrub_shadow_regions(struct vhost_dev *dev, + struct scrub_regions *add_reg, + int *nr_add_reg, + struct scrub_regions *rem_reg, + int *nr_rem_reg, uint64_t *shadow_pcb, + bool track_ramblocks) +{ + struct vhost_user *u = dev->opaque; + bool found[VHOST_MEMORY_MAX_NREGIONS] = {}; + struct vhost_memory_region *reg, *shadow_reg; + int i, j, fd, add_idx = 0, rm_idx = 0, fd_num = 0; + ram_addr_t offset; + MemoryRegion *mr; + bool matching; + + /* + * Find memory regions present in our shadow state which are not in + * the device's current memory state. + * + * Mark regions in both the shadow and device state as "found". + */ + for (i = 0; i < u->num_shadow_regions; i++) { + shadow_reg = &u->shadow_regions[i]; + matching = false; + + for (j = 0; j < dev->mem->nregions; j++) { + reg = &dev->mem->regions[j]; + + mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd); + + if (reg_equal(shadow_reg, reg)) { + matching = true; + found[j] = true; + if (track_ramblocks) { + /* + * Reset postcopy client bases, region_rb, and + * region_rb_offset in case regions are removed. + */ + if (fd > 0) { + u->region_rb_offset[j] = offset; + u->region_rb[j] = mr->ram_block; + shadow_pcb[j] = u->postcopy_client_bases[i]; + } else { + u->region_rb_offset[j] = 0; + u->region_rb[j] = NULL; + } + } + break; + } + } + + /* + * If the region was not found in the current device memory state + * create an entry for it in the removed list. + */ + if (!matching) { + rem_reg[rm_idx].region = shadow_reg; + rem_reg[rm_idx++].reg_idx = i; + } + } + + /* + * For regions not marked "found", create entries in the added list. + * + * Note their indexes in the device memory state and the indexes of their + * file descriptors. + */ + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd); + if (fd > 0) { + ++fd_num; + } + + /* + * If the region was in both the shadow and device state we don't + * need to send a VHOST_USER_ADD_MEM_REG message for it. + */ + if (found[i]) { + continue; + } + + add_reg[add_idx].region = reg; + add_reg[add_idx].reg_idx = i; + add_reg[add_idx++].fd_idx = fd_num; + } + *nr_rem_reg = rm_idx; + *nr_add_reg = add_idx; + + return; +} + +static int send_remove_regions(struct vhost_dev *dev, + struct scrub_regions *remove_reg, + int nr_rem_reg, VhostUserMsg *msg, + bool reply_supported) +{ + struct vhost_user *u = dev->opaque; + struct vhost_memory_region *shadow_reg; + int i, fd, shadow_reg_idx, ret; + ram_addr_t offset; + VhostUserMemoryRegion region_buffer; + + /* + * The regions in remove_reg appear in the same order they do in the + * shadow table. Therefore we can minimize memory copies by iterating + * through remove_reg backwards. + */ + for (i = nr_rem_reg - 1; i >= 0; i--) { + shadow_reg = remove_reg[i].region; + shadow_reg_idx = remove_reg[i].reg_idx; + + vhost_user_get_mr_data(shadow_reg->userspace_addr, &offset, &fd); + + if (fd > 0) { + msg->hdr.request = VHOST_USER_REM_MEM_REG; + vhost_user_fill_msg_region(®ion_buffer, shadow_reg); + msg->payload.mem_reg.region = region_buffer; + + if (vhost_user_write(dev, msg, &fd, 1) < 0) { + return -1; + } + + if (reply_supported) { + ret = process_message_reply(dev, msg); + if (ret) { + return ret; + } + } + } + + /* + * At this point we know the backend has unmapped the region. It is now + * safe to remove it from the shadow table. + */ + memmove(&u->shadow_regions[shadow_reg_idx], + &u->shadow_regions[shadow_reg_idx + 1], + sizeof(struct vhost_memory_region) * + (u->num_shadow_regions - shadow_reg_idx)); + u->num_shadow_regions--; + } + + return 0; +} + +static int send_add_regions(struct vhost_dev *dev, + struct scrub_regions *add_reg, int nr_add_reg, + VhostUserMsg *msg, uint64_t *shadow_pcb, + bool reply_supported, bool track_ramblocks) +{ + struct vhost_user *u = dev->opaque; + int i, fd, ret, reg_idx, reg_fd_idx; + struct vhost_memory_region *reg; + MemoryRegion *mr; + ram_addr_t offset; + VhostUserMsg msg_reply; + VhostUserMemoryRegion region_buffer; + + for (i = 0; i < nr_add_reg; i++) { + reg = add_reg[i].region; + reg_idx = add_reg[i].reg_idx; + reg_fd_idx = add_reg[i].fd_idx; + + mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd); + + if (fd > 0) { + if (track_ramblocks) { + trace_vhost_user_set_mem_table_withfd(reg_fd_idx, mr->name, + reg->memory_size, + reg->guest_phys_addr, + reg->userspace_addr, + offset); + u->region_rb_offset[reg_idx] = offset; + u->region_rb[reg_idx] = mr->ram_block; + } + msg->hdr.request = VHOST_USER_ADD_MEM_REG; + vhost_user_fill_msg_region(®ion_buffer, reg); + msg->payload.mem_reg.region = region_buffer; + msg->payload.mem_reg.region.mmap_offset = offset; + + if (vhost_user_write(dev, msg, &fd, 1) < 0) { + return -1; + } + + if (track_ramblocks) { + uint64_t reply_gpa; + + if (vhost_user_read(dev, &msg_reply) < 0) { + return -1; + } + + reply_gpa = msg_reply.payload.mem_reg.region.guest_phys_addr; + + if (msg_reply.hdr.request != VHOST_USER_ADD_MEM_REG) { + error_report("%s: Received unexpected msg type." + "Expected %d received %d", __func__, + VHOST_USER_ADD_MEM_REG, + msg_reply.hdr.request); + return -1; + } + + /* + * We're using the same structure, just reusing one of the + * fields, so it should be the same size. + */ + if (msg_reply.hdr.size != msg->hdr.size) { + error_report("%s: Unexpected size for postcopy reply " + "%d vs %d", __func__, msg_reply.hdr.size, + msg->hdr.size); + return -1; + } + + /* Get the postcopy client base from the backend's reply. */ + if (reply_gpa == dev->mem->regions[reg_idx].guest_phys_addr) { + shadow_pcb[reg_idx] = + msg_reply.payload.mem_reg.region.userspace_addr; + trace_vhost_user_set_mem_table_postcopy( + msg_reply.payload.mem_reg.region.userspace_addr, + msg->payload.mem_reg.region.userspace_addr, + reg_fd_idx, reg_idx); + } else { + error_report("%s: invalid postcopy reply for region. " + "Got guest physical address %" PRIX64 ", expected " + "%" PRIX64, __func__, reply_gpa, + dev->mem->regions[reg_idx].guest_phys_addr); + return -1; + } + } else if (reply_supported) { + ret = process_message_reply(dev, msg); + if (ret) { + return ret; + } + } + } else if (track_ramblocks) { + u->region_rb_offset[reg_idx] = 0; + u->region_rb[reg_idx] = NULL; + } + + /* + * At this point, we know the backend has mapped in the new + * region, if the region has a valid file descriptor. + * + * The region should now be added to the shadow table. + */ + u->shadow_regions[u->num_shadow_regions].guest_phys_addr = + reg->guest_phys_addr; + u->shadow_regions[u->num_shadow_regions].userspace_addr = + reg->userspace_addr; + u->shadow_regions[u->num_shadow_regions].memory_size = + reg->memory_size; + u->num_shadow_regions++; + } + + return 0; +} + +static int vhost_user_add_remove_regions(struct vhost_dev *dev, + VhostUserMsg *msg, + bool reply_supported, + bool track_ramblocks) +{ + struct vhost_user *u = dev->opaque; + struct scrub_regions add_reg[VHOST_MEMORY_MAX_NREGIONS]; + struct scrub_regions rem_reg[VHOST_MEMORY_MAX_NREGIONS]; + uint64_t shadow_pcb[VHOST_MEMORY_MAX_NREGIONS] = {}; + int nr_add_reg, nr_rem_reg; + + msg->hdr.size = sizeof(msg->payload.mem_reg.padding) + + sizeof(VhostUserMemoryRegion); + + /* Find the regions which need to be removed or added. */ + scrub_shadow_regions(dev, add_reg, &nr_add_reg, rem_reg, &nr_rem_reg, + shadow_pcb, track_ramblocks); + + if (nr_rem_reg && send_remove_regions(dev, rem_reg, nr_rem_reg, msg, + reply_supported) < 0) + { + goto err; + } + + if (nr_add_reg && send_add_regions(dev, add_reg, nr_add_reg, msg, + shadow_pcb, reply_supported, track_ramblocks) < 0) + { + goto err; + } + + if (track_ramblocks) { + memcpy(u->postcopy_client_bases, shadow_pcb, + sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS); + /* + * Now we've registered this with the postcopy code, we ack to the + * client, because now we're in the position to be able to deal with + * any faults it generates. + */ + /* TODO: Use this for failure cases as well with a bad value. */ + msg->hdr.size = sizeof(msg->payload.u64); + msg->payload.u64 = 0; /* OK */ + + if (vhost_user_write(dev, msg, NULL, 0) < 0) { + return -1; + } + } + + return 0; + +err: + if (track_ramblocks) { + memcpy(u->postcopy_client_bases, shadow_pcb, + sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS); + } + + return -1; +} + static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev, - struct vhost_memory *mem) + struct vhost_memory *mem, + bool reply_supported, + bool config_mem_slots) { struct vhost_user *u = dev->opaque; int fds[VHOST_MEMORY_MAX_NREGIONS]; @@ -513,71 +855,84 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev, u->region_rb_len = dev->mem->nregions; } - if (vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num, + if (config_mem_slots) { + if (vhost_user_add_remove_regions(dev, &msg, reply_supported, true) < 0) { - return -1; - } - - if (vhost_user_write(dev, &msg, fds, fd_num) < 0) { - return -1; - } - - if (vhost_user_read(dev, &msg_reply) < 0) { - return -1; - } - - if (msg_reply.hdr.request != VHOST_USER_SET_MEM_TABLE) { - error_report("%s: Received unexpected msg type." - "Expected %d received %d", __func__, - VHOST_USER_SET_MEM_TABLE, msg_reply.hdr.request); - return -1; - } - /* We're using the same structure, just reusing one of the - * fields, so it should be the same size. - */ - if (msg_reply.hdr.size != msg.hdr.size) { - error_report("%s: Unexpected size for postcopy reply " - "%d vs %d", __func__, msg_reply.hdr.size, msg.hdr.size); - return -1; - } - - memset(u->postcopy_client_bases, 0, - sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS); - - /* They're in the same order as the regions that were sent - * but some of the regions were skipped (above) if they - * didn't have fd's - */ - for (msg_i = 0, region_i = 0; - region_i < dev->mem->nregions; - region_i++) { - if (msg_i < fd_num && - msg_reply.payload.memory.regions[msg_i].guest_phys_addr == - dev->mem->regions[region_i].guest_phys_addr) { - u->postcopy_client_bases[region_i] = - msg_reply.payload.memory.regions[msg_i].userspace_addr; - trace_vhost_user_set_mem_table_postcopy( - msg_reply.payload.memory.regions[msg_i].userspace_addr, - msg.payload.memory.regions[msg_i].userspace_addr, - msg_i, region_i); - msg_i++; + return -1; + } + } else { + if (vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num, + true) < 0) { + return -1; + } + + if (vhost_user_write(dev, &msg, fds, fd_num) < 0) { + return -1; + } + + if (vhost_user_read(dev, &msg_reply) < 0) { + return -1; + } + + if (msg_reply.hdr.request != VHOST_USER_SET_MEM_TABLE) { + error_report("%s: Received unexpected msg type." + "Expected %d received %d", __func__, + VHOST_USER_SET_MEM_TABLE, msg_reply.hdr.request); + return -1; + } + + /* + * We're using the same structure, just reusing one of the + * fields, so it should be the same size. + */ + if (msg_reply.hdr.size != msg.hdr.size) { + error_report("%s: Unexpected size for postcopy reply " + "%d vs %d", __func__, msg_reply.hdr.size, + msg.hdr.size); + return -1; + } + + memset(u->postcopy_client_bases, 0, + sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS); + + /* + * They're in the same order as the regions that were sent + * but some of the regions were skipped (above) if they + * didn't have fd's + */ + for (msg_i = 0, region_i = 0; + region_i < dev->mem->nregions; + region_i++) { + if (msg_i < fd_num && + msg_reply.payload.memory.regions[msg_i].guest_phys_addr == + dev->mem->regions[region_i].guest_phys_addr) { + u->postcopy_client_bases[region_i] = + msg_reply.payload.memory.regions[msg_i].userspace_addr; + trace_vhost_user_set_mem_table_postcopy( + msg_reply.payload.memory.regions[msg_i].userspace_addr, + msg.payload.memory.regions[msg_i].userspace_addr, + msg_i, region_i); + msg_i++; + } + } + if (msg_i != fd_num) { + error_report("%s: postcopy reply not fully consumed " + "%d vs %zd", + __func__, msg_i, fd_num); + return -1; + } + + /* + * Now we've registered this with the postcopy code, we ack to the + * client, because now we're in the position to be able to deal + * with any faults it generates. + */ + /* TODO: Use this for failure cases as well with a bad value. */ + msg.hdr.size = sizeof(msg.payload.u64); + msg.payload.u64 = 0; /* OK */ + if (vhost_user_write(dev, &msg, NULL, 0) < 0) { + return -1; } - } - if (msg_i != fd_num) { - error_report("%s: postcopy reply not fully consumed " - "%d vs %zd", - __func__, msg_i, fd_num); - return -1; - } - /* Now we've registered this with the postcopy code, we ack to the client, - * because now we're in the position to be able to deal with any faults - * it generates. - */ - /* TODO: Use this for failure cases as well with a bad value */ - msg.hdr.size = sizeof(msg.payload.u64); - msg.payload.u64 = 0; /* OK */ - if (vhost_user_write(dev, &msg, NULL, 0) < 0) { - return -1; } return 0; @@ -592,12 +947,17 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev, bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler; bool reply_supported = virtio_has_feature(dev->protocol_features, VHOST_USER_PROTOCOL_F_REPLY_ACK); + bool config_mem_slots = + virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS); if (do_postcopy) { - /* Postcopy has enough differences that it's best done in it's own + /* + * Postcopy has enough differences that it's best done in it's own * version */ - return vhost_user_set_mem_table_postcopy(dev, mem); + return vhost_user_set_mem_table_postcopy(dev, mem, reply_supported, + config_mem_slots); } VhostUserMsg msg = { @@ -608,17 +968,23 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev, msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK; } - if (vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num, + if (config_mem_slots) { + if (vhost_user_add_remove_regions(dev, &msg, reply_supported, false) < 0) { - return -1; - } + return -1; + } + } else { + if (vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num, + false) < 0) { + return -1; + } + if (vhost_user_write(dev, &msg, fds, fd_num) < 0) { + return -1; + } - if (vhost_user_write(dev, &msg, fds, fd_num) < 0) { - return -1; - } - - if (reply_supported) { - return process_message_reply(dev, &msg); + if (reply_supported) { + return process_message_reply(dev, &msg); + } } return 0; From 27598393a23215cfbf92ad550b9541675b0b8f2b Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 21 May 2020 05:00:40 +0000 Subject: [PATCH 40/58] Lift max memory slots limit imposed by vhost-user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Historically, sending all memory regions to vhost-user backends in a single message imposed a limitation on the number of times memory could be hot-added to a VM with a vhost-user device. Now that backends which support the VHOST_USER_PROTOCOL_F_CONFIGURE_SLOTS send memory regions individually, we no longer need to impose this limitation on devices which support this feature. With this change, VMs with a vhost-user device which supports the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS can support a configurable number of memory slots, up to the maximum allowed by the target platform. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz Signed-off-by: Peter Turschmid Suggested-by: Mike Cui Message-Id: <1588533678-23450-6-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Marc-André Lureau --- docs/interop/vhost-user.rst | 7 ++--- hw/virtio/vhost-user.c | 56 ++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index 037eefab0e..688b7c6900 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -1273,10 +1273,9 @@ Master message types feature has been successfully negotiated, this message is submitted by master to the slave. The slave should return the message with a u64 payload containing the maximum number of memory slots for - QEMU to expose to the guest. At this point, the value returned - by the backend will be capped at the maximum number of ram slots - which can be supported by vhost-user. Currently that limit is set - at VHOST_USER_MAX_RAM_SLOTS = 8. + QEMU to expose to the guest. The value returned by the backend + will be capped at the maximum number of ram slots which can be + supported by the target platform. ``VHOST_USER_ADD_MEM_REG`` :id: 37 diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 3640f017a2..4d6cd4e58a 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -35,10 +35,28 @@ #include #endif -#define VHOST_MEMORY_MAX_NREGIONS 8 +#define VHOST_MEMORY_BASELINE_NREGIONS 8 #define VHOST_USER_F_PROTOCOL_FEATURES 30 #define VHOST_USER_SLAVE_MAX_FDS 8 +/* + * Set maximum number of RAM slots supported to + * the maximum number supported by the target + * hardware plaform. + */ +#if defined(TARGET_X86) || defined(TARGET_X86_64) || \ + defined(TARGET_ARM) || defined(TARGET_ARM_64) +#include "hw/acpi/acpi.h" +#define VHOST_USER_MAX_RAM_SLOTS ACPI_MAX_RAM_SLOTS + +#elif defined(TARGET_PPC) || defined(TARGET_PPC_64) +#include "hw/ppc/spapr.h" +#define VHOST_USER_MAX_RAM_SLOTS SPAPR_MAX_RAM_SLOTS + +#else +#define VHOST_USER_MAX_RAM_SLOTS 512 +#endif + /* * Maximum size of virtio device config space */ @@ -127,7 +145,7 @@ typedef struct VhostUserMemoryRegion { typedef struct VhostUserMemory { uint32_t nregions; uint32_t padding; - VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; + VhostUserMemoryRegion regions[VHOST_MEMORY_BASELINE_NREGIONS]; } VhostUserMemory; typedef struct VhostUserMemRegMsg { @@ -222,7 +240,7 @@ struct vhost_user { int slave_fd; NotifierWithReturn postcopy_notifier; struct PostCopyFD postcopy_fd; - uint64_t postcopy_client_bases[VHOST_MEMORY_MAX_NREGIONS]; + uint64_t postcopy_client_bases[VHOST_USER_MAX_RAM_SLOTS]; /* Length of the region_rb and region_rb_offset arrays */ size_t region_rb_len; /* RAMBlock associated with a given region */ @@ -237,7 +255,7 @@ struct vhost_user { /* Our current regions */ int num_shadow_regions; - struct vhost_memory_region shadow_regions[VHOST_MEMORY_MAX_NREGIONS]; + struct vhost_memory_region shadow_regions[VHOST_USER_MAX_RAM_SLOTS]; }; struct scrub_regions { @@ -392,7 +410,7 @@ int vhost_user_gpu_set_socket(struct vhost_dev *dev, int fd) static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, struct vhost_log *log) { - int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fds[VHOST_USER_MAX_RAM_SLOTS]; size_t fd_num = 0; bool shmfd = virtio_has_feature(dev->protocol_features, VHOST_USER_PROTOCOL_F_LOG_SHMFD); @@ -470,7 +488,7 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u, mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd); if (fd > 0) { if (track_ramblocks) { - assert(*fd_num < VHOST_MEMORY_MAX_NREGIONS); + assert(*fd_num < VHOST_MEMORY_BASELINE_NREGIONS); trace_vhost_user_set_mem_table_withfd(*fd_num, mr->name, reg->memory_size, reg->guest_phys_addr, @@ -478,7 +496,7 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u, offset); u->region_rb_offset[i] = offset; u->region_rb[i] = mr->ram_block; - } else if (*fd_num == VHOST_MEMORY_MAX_NREGIONS) { + } else if (*fd_num == VHOST_MEMORY_BASELINE_NREGIONS) { error_report("Failed preparing vhost-user memory table msg"); return -1; } @@ -523,7 +541,7 @@ static void scrub_shadow_regions(struct vhost_dev *dev, bool track_ramblocks) { struct vhost_user *u = dev->opaque; - bool found[VHOST_MEMORY_MAX_NREGIONS] = {}; + bool found[VHOST_USER_MAX_RAM_SLOTS] = {}; struct vhost_memory_region *reg, *shadow_reg; int i, j, fd, add_idx = 0, rm_idx = 0, fd_num = 0; ram_addr_t offset; @@ -777,9 +795,9 @@ static int vhost_user_add_remove_regions(struct vhost_dev *dev, bool track_ramblocks) { struct vhost_user *u = dev->opaque; - struct scrub_regions add_reg[VHOST_MEMORY_MAX_NREGIONS]; - struct scrub_regions rem_reg[VHOST_MEMORY_MAX_NREGIONS]; - uint64_t shadow_pcb[VHOST_MEMORY_MAX_NREGIONS] = {}; + struct scrub_regions add_reg[VHOST_USER_MAX_RAM_SLOTS]; + struct scrub_regions rem_reg[VHOST_USER_MAX_RAM_SLOTS]; + uint64_t shadow_pcb[VHOST_USER_MAX_RAM_SLOTS] = {}; int nr_add_reg, nr_rem_reg; msg->hdr.size = sizeof(msg->payload.mem_reg.padding) + @@ -803,7 +821,7 @@ static int vhost_user_add_remove_regions(struct vhost_dev *dev, if (track_ramblocks) { memcpy(u->postcopy_client_bases, shadow_pcb, - sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS); + sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS); /* * Now we've registered this with the postcopy code, we ack to the * client, because now we're in the position to be able to deal with @@ -823,7 +841,7 @@ static int vhost_user_add_remove_regions(struct vhost_dev *dev, err: if (track_ramblocks) { memcpy(u->postcopy_client_bases, shadow_pcb, - sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS); + sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS); } return -1; @@ -835,7 +853,7 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev, bool config_mem_slots) { struct vhost_user *u = dev->opaque; - int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; size_t fd_num = 0; VhostUserMsg msg_reply; int region_i, msg_i; @@ -893,7 +911,7 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev, } memset(u->postcopy_client_bases, 0, - sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS); + sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS); /* * They're in the same order as the regions that were sent @@ -942,7 +960,7 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev, struct vhost_memory *mem) { struct vhost_user *u = dev->opaque; - int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; size_t fd_num = 0; bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler; bool reply_supported = virtio_has_feature(dev->protocol_features, @@ -1149,7 +1167,7 @@ static int vhost_set_vring_file(struct vhost_dev *dev, VhostUserRequest request, struct vhost_vring_file *file) { - int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fds[VHOST_USER_MAX_RAM_SLOTS]; size_t fd_num = 0; VhostUserMsg msg = { .hdr.request = request, @@ -1845,7 +1863,7 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque) /* get max memory regions if backend supports configurable RAM slots */ if (!virtio_has_feature(dev->protocol_features, VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS)) { - u->user->memory_slots = VHOST_MEMORY_MAX_NREGIONS; + u->user->memory_slots = VHOST_MEMORY_BASELINE_NREGIONS; } else { err = vhost_user_get_max_memslots(dev, &ram_slots); if (err < 0) { @@ -1860,7 +1878,7 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque) return -1; } - u->user->memory_slots = MIN(ram_slots, VHOST_MEMORY_MAX_NREGIONS); + u->user->memory_slots = MIN(ram_slots, VHOST_USER_MAX_RAM_SLOTS); } } From 08fccf8f077397b47c740d4850cdeedc947f21ac Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 21 May 2020 05:00:47 +0000 Subject: [PATCH 41/58] Refactor out libvhost-user fault generation logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In libvhost-user, the incoming postcopy migration path for setting the backend's memory tables has become convolued. In particular, moving the logic which starts generating faults, having received the final ACK from qemu can be moved to a separate function. This simplifies the code substantially. This logic will also be needed by the postcopy path once the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS feature is supported. Signed-off-by: Raphael Norwitz Message-Id: <1588533678-23450-7-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Marc-André Lureau --- contrib/libvhost-user/libvhost-user.c | 147 ++++++++++++++------------ 1 file changed, 79 insertions(+), 68 deletions(-) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index 3bca996c62..cccfa22209 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -583,6 +583,84 @@ map_ring(VuDev *dev, VuVirtq *vq) return !(vq->vring.desc && vq->vring.used && vq->vring.avail); } +static bool +generate_faults(VuDev *dev) { + int i; + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *dev_region = &dev->regions[i]; + int ret; +#ifdef UFFDIO_REGISTER + /* + * We should already have an open ufd. Mark each memory + * range as ufd. + * Discard any mapping we have here; note I can't use MADV_REMOVE + * or fallocate to make the hole since I don't want to lose + * data that's already arrived in the shared process. + * TODO: How to do hugepage + */ + ret = madvise((void *)(uintptr_t)dev_region->mmap_addr, + dev_region->size + dev_region->mmap_offset, + MADV_DONTNEED); + if (ret) { + fprintf(stderr, + "%s: Failed to madvise(DONTNEED) region %d: %s\n", + __func__, i, strerror(errno)); + } + /* + * Turn off transparent hugepages so we dont get lose wakeups + * in neighbouring pages. + * TODO: Turn this backon later. + */ + ret = madvise((void *)(uintptr_t)dev_region->mmap_addr, + dev_region->size + dev_region->mmap_offset, + MADV_NOHUGEPAGE); + if (ret) { + /* + * Note: This can happen legally on kernels that are configured + * without madvise'able hugepages + */ + fprintf(stderr, + "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n", + __func__, i, strerror(errno)); + } + struct uffdio_register reg_struct; + reg_struct.range.start = (uintptr_t)dev_region->mmap_addr; + reg_struct.range.len = dev_region->size + dev_region->mmap_offset; + reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; + + if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, ®_struct)) { + vu_panic(dev, "%s: Failed to userfault region %d " + "@%p + size:%zx offset: %zx: (ufd=%d)%s\n", + __func__, i, + dev_region->mmap_addr, + dev_region->size, dev_region->mmap_offset, + dev->postcopy_ufd, strerror(errno)); + return false; + } + if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) { + vu_panic(dev, "%s Region (%d) doesn't support COPY", + __func__, i); + return false; + } + DPRINT("%s: region %d: Registered userfault for %" + PRIx64 " + %" PRIx64 "\n", __func__, i, + (uint64_t)reg_struct.range.start, + (uint64_t)reg_struct.range.len); + /* Now it's registered we can let the client at it */ + if (mprotect((void *)(uintptr_t)dev_region->mmap_addr, + dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE)) { + vu_panic(dev, "failed to mprotect region %d for postcopy (%s)", + i, strerror(errno)); + return false; + } + /* TODO: Stash 'zero' support flags somewhere */ +#endif + } + + return true; +} + static bool vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) { @@ -655,74 +733,7 @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) } /* OK, now we can go and register the memory and generate faults */ - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *dev_region = &dev->regions[i]; - int ret; -#ifdef UFFDIO_REGISTER - /* We should already have an open ufd. Mark each memory - * range as ufd. - * Discard any mapping we have here; note I can't use MADV_REMOVE - * or fallocate to make the hole since I don't want to lose - * data that's already arrived in the shared process. - * TODO: How to do hugepage - */ - ret = madvise((void *)(uintptr_t)dev_region->mmap_addr, - dev_region->size + dev_region->mmap_offset, - MADV_DONTNEED); - if (ret) { - fprintf(stderr, - "%s: Failed to madvise(DONTNEED) region %d: %s\n", - __func__, i, strerror(errno)); - } - /* Turn off transparent hugepages so we dont get lose wakeups - * in neighbouring pages. - * TODO: Turn this backon later. - */ - ret = madvise((void *)(uintptr_t)dev_region->mmap_addr, - dev_region->size + dev_region->mmap_offset, - MADV_NOHUGEPAGE); - if (ret) { - /* Note: This can happen legally on kernels that are configured - * without madvise'able hugepages - */ - fprintf(stderr, - "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n", - __func__, i, strerror(errno)); - } - struct uffdio_register reg_struct; - reg_struct.range.start = (uintptr_t)dev_region->mmap_addr; - reg_struct.range.len = dev_region->size + dev_region->mmap_offset; - reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; - - if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, ®_struct)) { - vu_panic(dev, "%s: Failed to userfault region %d " - "@%p + size:%zx offset: %zx: (ufd=%d)%s\n", - __func__, i, - dev_region->mmap_addr, - dev_region->size, dev_region->mmap_offset, - dev->postcopy_ufd, strerror(errno)); - return false; - } - if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) { - vu_panic(dev, "%s Region (%d) doesn't support COPY", - __func__, i); - return false; - } - DPRINT("%s: region %d: Registered userfault for %" - PRIx64 " + %" PRIx64 "\n", __func__, i, - (uint64_t)reg_struct.range.start, - (uint64_t)reg_struct.range.len); - /* Now it's registered we can let the client at it */ - if (mprotect((void *)(uintptr_t)dev_region->mmap_addr, - dev_region->size + dev_region->mmap_offset, - PROT_READ | PROT_WRITE)) { - vu_panic(dev, "failed to mprotect region %d for postcopy (%s)", - i, strerror(errno)); - return false; - } - /* TODO: Stash 'zero' support flags somewhere */ -#endif - } + (void)generate_faults(dev); return false; } From 6fb2e173d20c9bbb5466183d33a3ad7dcd0375fa Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 21 May 2020 05:00:50 +0000 Subject: [PATCH 42/58] Support ram slot configuration in libvhost-user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VHOST_USER_GET_MAX_MEM_SLOTS message allows a vhost-user backend to specify a maximum number of ram slots it is willing to support. This change adds support for libvhost-user to process this message. For now the backend will reply with 8 as the maximum number of regions supported. libvhost-user does not yet support the vhost-user protocol feature VHOST_USER_PROTOCOL_F_CONFIGIRE_MEM_SLOTS, so qemu should never send the VHOST_USER_GET_MAX_MEM_SLOTS message. Therefore this new functionality is not currently used. Signed-off-by: Raphael Norwitz Message-Id: <1588533678-23450-8-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Marc-André Lureau --- contrib/libvhost-user/libvhost-user.c | 19 +++++++++++++++++++ contrib/libvhost-user/libvhost-user.h | 1 + 2 files changed, 20 insertions(+) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index cccfa22209..9f039b707e 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -137,6 +137,7 @@ vu_request_to_string(unsigned int req) REQ(VHOST_USER_SET_INFLIGHT_FD), REQ(VHOST_USER_GPU_SET_SOCKET), REQ(VHOST_USER_VRING_KICK), + REQ(VHOST_USER_GET_MAX_MEM_SLOTS), REQ(VHOST_USER_MAX), }; #undef REQ @@ -1565,6 +1566,22 @@ vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg) return false; } +static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg) +{ + vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; + vmsg->size = sizeof(vmsg->payload.u64); + vmsg->payload.u64 = VHOST_MEMORY_MAX_NREGIONS; + vmsg->fd_num = 0; + + if (!vu_message_write(dev, dev->sock, vmsg)) { + vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno)); + } + + DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_MEMORY_MAX_NREGIONS); + + return false; +} + static bool vu_process_message(VuDev *dev, VhostUserMsg *vmsg) { @@ -1649,6 +1666,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg) return vu_set_inflight_fd(dev, vmsg); case VHOST_USER_VRING_KICK: return vu_handle_vring_kick(dev, vmsg); + case VHOST_USER_GET_MAX_MEM_SLOTS: + return vu_handle_get_max_memslots(dev, vmsg); default: vmsg_close_fds(vmsg); vu_panic(dev, "Unhandled request: %d", vmsg->request); diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h index f30394fab6..88ef40d26a 100644 --- a/contrib/libvhost-user/libvhost-user.h +++ b/contrib/libvhost-user/libvhost-user.h @@ -97,6 +97,7 @@ typedef enum VhostUserRequest { VHOST_USER_SET_INFLIGHT_FD = 32, VHOST_USER_GPU_SET_SOCKET = 33, VHOST_USER_VRING_KICK = 35, + VHOST_USER_GET_MAX_MEM_SLOTS = 36, VHOST_USER_MAX } VhostUserRequest; From ec94c8e621de96c50c2d381c8c9ec94f5beec7c1 Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 21 May 2020 05:00:52 +0000 Subject: [PATCH 43/58] Support adding individual regions in libvhost-user When the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS is enabled, qemu will transmit memory regions to a backend individually using the new message VHOST_USER_ADD_MEM_REG. With this change vhost-user backends built with libvhost-user can now map in new memory regions when VHOST_USER_ADD_MEM_REG messages are received. Qemu only sends VHOST_USER_ADD_MEM_REG messages when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS feature is negotiated, and since it is not yet supported in libvhost-user, this new functionality is not yet used. Signed-off-by: Raphael Norwitz Message-Id: <1588533678-23450-9-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/libvhost-user/libvhost-user.c | 103 ++++++++++++++++++++++++++ contrib/libvhost-user/libvhost-user.h | 7 ++ 2 files changed, 110 insertions(+) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index 9f039b707e..d8ee7a23a3 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -138,6 +138,7 @@ vu_request_to_string(unsigned int req) REQ(VHOST_USER_GPU_SET_SOCKET), REQ(VHOST_USER_VRING_KICK), REQ(VHOST_USER_GET_MAX_MEM_SLOTS), + REQ(VHOST_USER_ADD_MEM_REG), REQ(VHOST_USER_MAX), }; #undef REQ @@ -662,6 +663,106 @@ generate_faults(VuDev *dev) { return true; } +static bool +vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { + int i; + bool track_ramblocks = dev->postcopy_listening; + VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; + VuDevRegion *dev_region = &dev->regions[dev->nregions]; + void *mmap_addr; + + /* + * If we are in postcopy mode and we receive a u64 payload with a 0 value + * we know all the postcopy client bases have been recieved, and we + * should start generating faults. + */ + if (track_ramblocks && + vmsg->size == sizeof(vmsg->payload.u64) && + vmsg->payload.u64 == 0) { + (void)generate_faults(dev); + return false; + } + + DPRINT("Adding region: %d\n", dev->nregions); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" mmap_offset 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* + * We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. + */ + if (track_ramblocks) { + /* + * In postcopy we're using PROT_NONE here to catch anyone + * accessing it before we userfault. + */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_NONE, MAP_SHARED, + vmsg->fds[0], 0); + } else { + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED, vmsg->fds[0], + 0); + } + + if (mmap_addr == MAP_FAILED) { + vu_panic(dev, "region mmap error: %s", strerror(errno)); + } else { + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + DPRINT(" mmap_addr: 0x%016"PRIx64"\n", + dev_region->mmap_addr); + } + + close(vmsg->fds[0]); + + if (track_ramblocks) { + /* + * Return the address to QEMU so that it can translate the ufd + * fault addresses back. + */ + msg_region->userspace_addr = (uintptr_t)(mmap_addr + + dev_region->mmap_offset); + + /* Send the message back to qemu with the addresses filled in. */ + vmsg->fd_num = 0; + if (!vu_send_reply(dev, dev->sock, vmsg)) { + vu_panic(dev, "failed to respond to add-mem-region for postcopy"); + return false; + } + + DPRINT("Successfully added new region in postcopy\n"); + dev->nregions++; + return false; + + } else { + for (i = 0; i < dev->max_queues; i++) { + if (dev->vq[i].vring.desc) { + if (map_ring(dev, &dev->vq[i])) { + vu_panic(dev, "remapping queue %d for new memory region", + i); + } + } + } + + DPRINT("Successfully added new region\n"); + dev->nregions++; + vmsg_set_reply_u64(vmsg, 0); + return true; + } +} + static bool vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) { @@ -1668,6 +1769,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg) return vu_handle_vring_kick(dev, vmsg); case VHOST_USER_GET_MAX_MEM_SLOTS: return vu_handle_get_max_memslots(dev, vmsg); + case VHOST_USER_ADD_MEM_REG: + return vu_add_mem_reg(dev, vmsg); default: vmsg_close_fds(vmsg); vu_panic(dev, "Unhandled request: %d", vmsg->request); diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h index 88ef40d26a..60ef7fd13e 100644 --- a/contrib/libvhost-user/libvhost-user.h +++ b/contrib/libvhost-user/libvhost-user.h @@ -98,6 +98,7 @@ typedef enum VhostUserRequest { VHOST_USER_GPU_SET_SOCKET = 33, VHOST_USER_VRING_KICK = 35, VHOST_USER_GET_MAX_MEM_SLOTS = 36, + VHOST_USER_ADD_MEM_REG = 37, VHOST_USER_MAX } VhostUserRequest; @@ -124,6 +125,11 @@ typedef struct VhostUserMemory { VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; } VhostUserMemory; +typedef struct VhostUserMemRegMsg { + uint32_t padding; + VhostUserMemoryRegion region; +} VhostUserMemRegMsg; + typedef struct VhostUserLog { uint64_t mmap_size; uint64_t mmap_offset; @@ -176,6 +182,7 @@ typedef struct VhostUserMsg { struct vhost_vring_state state; struct vhost_vring_addr addr; VhostUserMemory memory; + VhostUserMemRegMsg memreg; VhostUserLog log; VhostUserConfig config; VhostUserVringArea area; From 875b9fd97b34dae796d61330804b094ca7b1b403 Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 21 May 2020 05:00:56 +0000 Subject: [PATCH 44/58] Support individual region unmap in libvhost-user When the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature is enabled, on memory hot-unplug qemu will transmit memory regions to remove individually using the new message VHOST_USER_REM_MEM_REG message. With this change, vhost-user backends build with libvhost-user can now unmap individual memory regions when receiving the VHOST_USER_REM_MEM_REG message. Qemu only sends VHOST_USER_REM_MEM_REG messages when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS feature is negotiated, and support for that feature has not yet been added in libvhost-user, this new functionality is not yet used. Signed-off-by: Raphael Norwitz Message-Id: <1588533678-23450-10-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/libvhost-user/libvhost-user.c | 63 +++++++++++++++++++++++++++ contrib/libvhost-user/libvhost-user.h | 1 + 2 files changed, 64 insertions(+) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index d8ee7a23a3..386449b697 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -139,6 +139,7 @@ vu_request_to_string(unsigned int req) REQ(VHOST_USER_VRING_KICK), REQ(VHOST_USER_GET_MAX_MEM_SLOTS), REQ(VHOST_USER_ADD_MEM_REG), + REQ(VHOST_USER_REM_MEM_REG), REQ(VHOST_USER_MAX), }; #undef REQ @@ -763,6 +764,66 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { } } +static inline bool reg_equal(VuDevRegion *vudev_reg, + VhostUserMemoryRegion *msg_reg) +{ + if (vudev_reg->gpa == msg_reg->guest_phys_addr && + vudev_reg->qva == msg_reg->userspace_addr && + vudev_reg->size == msg_reg->memory_size) { + return true; + } + + return false; +} + +static bool +vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { + int i, j; + bool found = false; + VuDevRegion shadow_regions[VHOST_MEMORY_MAX_NREGIONS] = {}; + VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; + + DPRINT("Removing region:\n"); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" mmap_offset 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + for (i = 0, j = 0; i < dev->nregions; i++) { + if (!reg_equal(&dev->regions[i], msg_region)) { + shadow_regions[j].gpa = dev->regions[i].gpa; + shadow_regions[j].size = dev->regions[i].size; + shadow_regions[j].qva = dev->regions[i].qva; + shadow_regions[j].mmap_offset = dev->regions[i].mmap_offset; + j++; + } else { + found = true; + VuDevRegion *r = &dev->regions[i]; + void *m = (void *) (uintptr_t) r->mmap_addr; + + if (m) { + munmap(m, r->size + r->mmap_offset); + } + } + } + + if (found) { + memcpy(dev->regions, shadow_regions, + sizeof(VuDevRegion) * VHOST_MEMORY_MAX_NREGIONS); + DPRINT("Successfully removed a region\n"); + dev->nregions--; + vmsg_set_reply_u64(vmsg, 0); + } else { + vu_panic(dev, "Specified region not found\n"); + } + + return true; +} + static bool vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) { @@ -1771,6 +1832,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg) return vu_handle_get_max_memslots(dev, vmsg); case VHOST_USER_ADD_MEM_REG: return vu_add_mem_reg(dev, vmsg); + case VHOST_USER_REM_MEM_REG: + return vu_rem_mem_reg(dev, vmsg); default: vmsg_close_fds(vmsg); vu_panic(dev, "Unhandled request: %d", vmsg->request); diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h index 60ef7fd13e..f8439713a8 100644 --- a/contrib/libvhost-user/libvhost-user.h +++ b/contrib/libvhost-user/libvhost-user.h @@ -99,6 +99,7 @@ typedef enum VhostUserRequest { VHOST_USER_VRING_KICK = 35, VHOST_USER_GET_MAX_MEM_SLOTS = 36, VHOST_USER_ADD_MEM_REG = 37, + VHOST_USER_REM_MEM_REG = 38, VHOST_USER_MAX } VhostUserRequest; From b650d5f4b1cd3f9f8c4fdb319838c5c1e0695e41 Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 21 May 2020 05:00:59 +0000 Subject: [PATCH 45/58] Lift max ram slots limit in libvhost-user Historically, VMs with vhost-user devices could hot-add memory a maximum of 8 times. Now that the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been added, VMs with vhost-user backends which support this new feature can support a configurable number of ram slots up to the maximum supported by the target platform. This change adds VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS support for backends built with libvhost-user, and increases the number of supported ram slots from 8 to 32. Memory hot-add, hot-remove and postcopy migration were tested with the vhost-user-bridge sample. Signed-off-by: Raphael Norwitz Message-Id: <1588533678-23450-11-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/libvhost-user/libvhost-user.c | 17 +++++++++-------- contrib/libvhost-user/libvhost-user.h | 15 +++++++++++---- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index 386449b697..b1e607298c 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -269,7 +269,7 @@ have_userfault(void) static bool vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) { - char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { }; + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {}; struct iovec iov = { .iov_base = (char *)vmsg, .iov_len = VHOST_USER_HDR_SIZE, @@ -340,7 +340,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) { int rc; uint8_t *p = (uint8_t *)vmsg; - char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { }; + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {}; struct iovec iov = { .iov_base = (char *)vmsg, .iov_len = VHOST_USER_HDR_SIZE, @@ -353,7 +353,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) struct cmsghdr *cmsg; memset(control, 0, sizeof(control)); - assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS); + assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); if (vmsg->fd_num > 0) { size_t fdsize = vmsg->fd_num * sizeof(int); msg.msg_controllen = CMSG_SPACE(fdsize); @@ -780,7 +780,7 @@ static bool vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { int i, j; bool found = false; - VuDevRegion shadow_regions[VHOST_MEMORY_MAX_NREGIONS] = {}; + VuDevRegion shadow_regions[VHOST_USER_MAX_RAM_SLOTS] = {}; VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; DPRINT("Removing region:\n"); @@ -813,7 +813,7 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { if (found) { memcpy(dev->regions, shadow_regions, - sizeof(VuDevRegion) * VHOST_MEMORY_MAX_NREGIONS); + sizeof(VuDevRegion) * VHOST_USER_MAX_RAM_SLOTS); DPRINT("Successfully removed a region\n"); dev->nregions--; vmsg_set_reply_u64(vmsg, 0); @@ -1394,7 +1394,8 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | - 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; + 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | + 1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS; if (have_userfault()) { features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT; @@ -1732,14 +1733,14 @@ static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg) { vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; vmsg->size = sizeof(vmsg->payload.u64); - vmsg->payload.u64 = VHOST_MEMORY_MAX_NREGIONS; + vmsg->payload.u64 = VHOST_USER_MAX_RAM_SLOTS; vmsg->fd_num = 0; if (!vu_message_write(dev, dev->sock, vmsg)) { vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno)); } - DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_MEMORY_MAX_NREGIONS); + DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS); return false; } diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h index f8439713a8..844c37c648 100644 --- a/contrib/libvhost-user/libvhost-user.h +++ b/contrib/libvhost-user/libvhost-user.h @@ -28,7 +28,13 @@ #define VIRTQUEUE_MAX_SIZE 1024 -#define VHOST_MEMORY_MAX_NREGIONS 8 +#define VHOST_MEMORY_BASELINE_NREGIONS 8 + +/* + * Set a reasonable maximum number of ram slots, which will be supported by + * any architecture. + */ +#define VHOST_USER_MAX_RAM_SLOTS 32 typedef enum VhostSetConfigType { VHOST_SET_CONFIG_TYPE_MASTER = 0, @@ -55,6 +61,7 @@ enum VhostUserProtocolFeature { VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, VHOST_USER_PROTOCOL_F_MAX }; @@ -123,7 +130,7 @@ typedef struct VhostUserMemoryRegion { typedef struct VhostUserMemory { uint32_t nregions; uint32_t padding; - VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; + VhostUserMemoryRegion regions[VHOST_MEMORY_BASELINE_NREGIONS]; } VhostUserMemory; typedef struct VhostUserMemRegMsg { @@ -190,7 +197,7 @@ typedef struct VhostUserMsg { VhostUserInflight inflight; } payload; - int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; int fd_num; uint8_t *data; } VU_PACKED VhostUserMsg; @@ -368,7 +375,7 @@ typedef struct VuDevInflightInfo { struct VuDev { int sock; uint32_t nregions; - VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS]; + VuDevRegion regions[VHOST_USER_MAX_RAM_SLOTS]; VuVirtq *vq; VuDevInflightInfo inflight_info; int log_call_fd; From a9a5c473d25d83bcdad27d97e5fb451950ee894a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 29 May 2020 17:13:38 +0100 Subject: [PATCH 46/58] libvhost-user: advertise vring features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit libvhost-user implements several vring features without advertising them. There is no way for the vhost-user master to detect support for these features. Things more or less work today because QEMU assumes the vhost-user backend always implements certain feature bits like VIRTIO_RING_F_EVENT_IDX. This is not documented anywhere. This patch explicitly advertises features implemented in libvhost-user so that the vhost-user master does not need to make undocumented assumptions. Feature bits that libvhost-user now advertises can be removed from vhost-user-blk.c. Devices should not be responsible for advertising vring feature bits, that is libvhost-user's job. Cc: Marc-André Lureau Cc: Jason Wang Cc: Michael S. Tsirkin Signed-off-by: Stefan Hajnoczi Message-Id: <20200529161338.456017-1-stefanha@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Marc-André Lureau --- contrib/libvhost-user/libvhost-user.c | 10 ++++++++++ contrib/vhost-user-blk/vhost-user-blk.c | 4 +--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index b1e607298c..d315db1396 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -498,6 +498,16 @@ static bool vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg) { vmsg->payload.u64 = + /* + * The following VIRTIO feature bits are supported by our virtqueue + * implementation: + */ + 1ULL << VIRTIO_F_NOTIFY_ON_EMPTY | + 1ULL << VIRTIO_RING_F_INDIRECT_DESC | + 1ULL << VIRTIO_RING_F_EVENT_IDX | + 1ULL << VIRTIO_F_VERSION_1 | + + /* vhost-user feature bits */ 1ULL << VHOST_F_LOG_ALL | 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 6fd91c7e99..25eccd02b5 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -382,9 +382,7 @@ vub_get_features(VuDev *dev) 1ull << VIRTIO_BLK_F_DISCARD | 1ull << VIRTIO_BLK_F_WRITE_ZEROES | #endif - 1ull << VIRTIO_BLK_F_CONFIG_WCE | - 1ull << VIRTIO_F_VERSION_1 | - 1ull << VHOST_USER_F_PROTOCOL_FEATURES; + 1ull << VIRTIO_BLK_F_CONFIG_WCE; if (vdev_blk->enable_ro) { features |= 1ull << VIRTIO_BLK_F_RO; From 00823980b296c80a3ff7b448df20c48897cd62e9 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 27 May 2020 17:31:52 +0200 Subject: [PATCH 47/58] hw/pci: Fix crash when running QEMU with "-nic model=rocker" QEMU currently aborts when being started with "-nic model=rocker" or with "-net nic,model=rocker". This happens because the "rocker" device is not a normal NIC but a switch, which has different properties. Thus we should only consider real NIC devices for "-nic" and "-net". These devices can be identified by the "netdev" property, so check for this property before adding the device to the list. Reported-by: Michael Tokarev Fixes: 52310c3fa7dc854d ("net: allow using any PCI NICs in -net or -nic") Signed-off-by: Thomas Huth Message-Id: <20200527153152.9211-1-thuth@redhat.com> Reviewed-by: Paolo Bonzini Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 7bf2ae6d92..1b88a32cf7 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1891,7 +1891,18 @@ PCIDevice *pci_nic_init_nofail(NICInfo *nd, PCIBus *rootbus, if (test_bit(DEVICE_CATEGORY_NETWORK, dc->categories) && dc->user_creatable) { const char *name = object_class_get_name(list->data); - g_ptr_array_add(pci_nic_models, (gpointer)name); + /* + * A network device might also be something else than a NIC, see + * e.g. the "rocker" device. Thus we have to look for the "netdev" + * property, too. Unfortunately, some devices like virtio-net only + * create this property during instance_init, so we have to create + * a temporary instance here to be able to check it. + */ + Object *obj = object_new_with_class(OBJECT_CLASS(dc)); + if (object_property_find(obj, "netdev", NULL)) { + g_ptr_array_add(pci_nic_models, (gpointer)name); + } + object_unref(obj); } next = list->next; g_slist_free_1(list); From c6136ec0c6fea0db638de08720018fd4bc777787 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 22 May 2020 14:25:10 +0200 Subject: [PATCH 48/58] vhost-vsock: add vhost-vsock-common abstraction This patch prepares the introduction of vhost-user-vsock, moving the common code usable for both vhost-vsock and vhost-user-vsock devices, in the new vhost-vsock-common parent class. While moving the code, fixed checkpatch warnings about block comments. Signed-off-by: Stefano Garzarella Message-Id: <20200522122512.87413-2-sgarzare@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/Makefile.objs | 2 +- hw/virtio/vhost-vsock-common.c | 258 ++++++++++++++++++++++ hw/virtio/vhost-vsock.c | 283 ++++--------------------- include/hw/virtio/vhost-vsock-common.h | 47 ++++ include/hw/virtio/vhost-vsock.h | 11 +- 5 files changed, 350 insertions(+), 251 deletions(-) create mode 100644 hw/virtio/vhost-vsock-common.c create mode 100644 include/hw/virtio/vhost-vsock-common.h diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index 4e4d39a0a4..b1eeb44eac 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -17,7 +17,7 @@ obj-$(CONFIG_VIRTIO_PMEM) += virtio-pmem.o common-obj-$(call land,$(CONFIG_VIRTIO_PMEM),$(CONFIG_VIRTIO_PCI)) += virtio-pmem-pci.o obj-$(call land,$(CONFIG_VHOST_USER_FS),$(CONFIG_VIRTIO_PCI)) += vhost-user-fs-pci.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o -obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o +obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-common.o vhost-vsock.o ifeq ($(CONFIG_VIRTIO_PCI),y) obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-pci.o diff --git a/hw/virtio/vhost-vsock-common.c b/hw/virtio/vhost-vsock-common.c new file mode 100644 index 0000000000..5b2ebf3496 --- /dev/null +++ b/hw/virtio/vhost-vsock-common.c @@ -0,0 +1,258 @@ +/* + * Parent class for vhost-vsock devices + * + * Copyright 2015-2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" +#include "standard-headers/linux/virtio_vsock.h" +#include "qapi/error.h" +#include "hw/virtio/virtio-access.h" +#include "qemu/error-report.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-vsock.h" +#include "qemu/iov.h" +#include "monitor/monitor.h" + +int vhost_vsock_common_start(VirtIODevice *vdev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + int i; + + if (!k->set_guest_notifiers) { + error_report("binding does not support guest notifiers"); + return -ENOSYS; + } + + ret = vhost_dev_enable_notifiers(&vvc->vhost_dev, vdev); + if (ret < 0) { + error_report("Error enabling host notifiers: %d", -ret); + return ret; + } + + ret = k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, true); + if (ret < 0) { + error_report("Error binding guest notifier: %d", -ret); + goto err_host_notifiers; + } + + vvc->vhost_dev.acked_features = vdev->guest_features; + ret = vhost_dev_start(&vvc->vhost_dev, vdev); + if (ret < 0) { + error_report("Error starting vhost: %d", -ret); + goto err_guest_notifiers; + } + + /* + * guest_notifier_mask/pending not used yet, so just unmask + * everything here. virtio-pci will do the right thing by + * enabling/disabling irqfd. + */ + for (i = 0; i < vvc->vhost_dev.nvqs; i++) { + vhost_virtqueue_mask(&vvc->vhost_dev, vdev, i, false); + } + + return 0; + +err_guest_notifiers: + k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, false); +err_host_notifiers: + vhost_dev_disable_notifiers(&vvc->vhost_dev, vdev); + return ret; +} + +void vhost_vsock_common_stop(VirtIODevice *vdev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + + if (!k->set_guest_notifiers) { + return; + } + + vhost_dev_stop(&vvc->vhost_dev, vdev); + + ret = k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, false); + if (ret < 0) { + error_report("vhost guest notifier cleanup failed: %d", ret); + return; + } + + vhost_dev_disable_notifiers(&vvc->vhost_dev, vdev); +} + + +static void vhost_vsock_common_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + /* Do nothing */ +} + +static void vhost_vsock_common_guest_notifier_mask(VirtIODevice *vdev, int idx, + bool mask) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + vhost_virtqueue_mask(&vvc->vhost_dev, vdev, idx, mask); +} + +static bool vhost_vsock_common_guest_notifier_pending(VirtIODevice *vdev, + int idx) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + return vhost_virtqueue_pending(&vvc->vhost_dev, idx); +} + +static void vhost_vsock_common_send_transport_reset(VHostVSockCommon *vvc) +{ + VirtQueueElement *elem; + VirtQueue *vq = vvc->event_vq; + struct virtio_vsock_event event = { + .id = cpu_to_le32(VIRTIO_VSOCK_EVENT_TRANSPORT_RESET), + }; + + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + error_report("vhost-vsock missed transport reset event"); + return; + } + + if (elem->out_num) { + error_report("invalid vhost-vsock event virtqueue element with " + "out buffers"); + goto out; + } + + if (iov_from_buf(elem->in_sg, elem->in_num, 0, + &event, sizeof(event)) != sizeof(event)) { + error_report("vhost-vsock event virtqueue element is too short"); + goto out; + } + + virtqueue_push(vq, elem, sizeof(event)); + virtio_notify(VIRTIO_DEVICE(vvc), vq); + +out: + g_free(elem); +} + +static void vhost_vsock_common_post_load_timer_cleanup(VHostVSockCommon *vvc) +{ + if (!vvc->post_load_timer) { + return; + } + + timer_del(vvc->post_load_timer); + timer_free(vvc->post_load_timer); + vvc->post_load_timer = NULL; +} + +static void vhost_vsock_common_post_load_timer_cb(void *opaque) +{ + VHostVSockCommon *vvc = opaque; + + vhost_vsock_common_post_load_timer_cleanup(vvc); + vhost_vsock_common_send_transport_reset(vvc); +} + +int vhost_vsock_common_pre_save(void *opaque) +{ + VHostVSockCommon *vvc = opaque; + + /* + * At this point, backend must be stopped, otherwise + * it might keep writing to memory. + */ + assert(!vvc->vhost_dev.started); + + return 0; +} + +int vhost_vsock_common_post_load(void *opaque, int version_id) +{ + VHostVSockCommon *vvc = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(vvc); + + if (virtio_queue_get_addr(vdev, 2)) { + /* + * Defer transport reset event to a vm clock timer so that virtqueue + * changes happen after migration has completed. + */ + assert(!vvc->post_load_timer); + vvc->post_load_timer = + timer_new_ns(QEMU_CLOCK_VIRTUAL, + vhost_vsock_common_post_load_timer_cb, + vvc); + timer_mod(vvc->post_load_timer, 1); + } + return 0; +} + +void vhost_vsock_common_realize(VirtIODevice *vdev, const char *name) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + virtio_init(vdev, name, VIRTIO_ID_VSOCK, + sizeof(struct virtio_vsock_config)); + + /* Receive and transmit queues belong to vhost */ + vvc->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, + vhost_vsock_common_handle_output); + vvc->trans_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, + vhost_vsock_common_handle_output); + + /* The event queue belongs to QEMU */ + vvc->event_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, + vhost_vsock_common_handle_output); + + vvc->vhost_dev.nvqs = ARRAY_SIZE(vvc->vhost_vqs); + vvc->vhost_dev.vqs = vvc->vhost_vqs; + + vvc->post_load_timer = NULL; +} + +void vhost_vsock_common_unrealize(VirtIODevice *vdev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + vhost_vsock_common_post_load_timer_cleanup(vvc); + + virtio_delete_queue(vvc->recv_vq); + virtio_delete_queue(vvc->trans_vq); + virtio_delete_queue(vvc->event_vq); + virtio_cleanup(vdev); +} + +static void vhost_vsock_common_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + vdc->guest_notifier_mask = vhost_vsock_common_guest_notifier_mask; + vdc->guest_notifier_pending = vhost_vsock_common_guest_notifier_pending; +} + +static const TypeInfo vhost_vsock_common_info = { + .name = TYPE_VHOST_VSOCK_COMMON, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VHostVSockCommon), + .class_init = vhost_vsock_common_class_init, + .abstract = true, +}; + +static void vhost_vsock_common_register_types(void) +{ + type_register_static(&vhost_vsock_common_info); +} + +type_init(vhost_vsock_common_register_types) diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c index 4a228f5168..c8f0699b4f 100644 --- a/hw/virtio/vhost-vsock.c +++ b/hw/virtio/vhost-vsock.c @@ -12,24 +12,14 @@ */ #include "qemu/osdep.h" -#include #include "standard-headers/linux/virtio_vsock.h" #include "qapi/error.h" -#include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" #include "qemu/error-report.h" #include "hw/qdev-properties.h" #include "hw/virtio/vhost-vsock.h" -#include "qemu/iov.h" -#include "qemu/module.h" #include "monitor/monitor.h" -enum { - VHOST_VSOCK_SAVEVM_VERSION = 0, - - VHOST_VSOCK_QUEUE_SIZE = 128, -}; - static void vhost_vsock_get_config(VirtIODevice *vdev, uint8_t *config) { VHostVSock *vsock = VHOST_VSOCK(vdev); @@ -39,16 +29,18 @@ static void vhost_vsock_get_config(VirtIODevice *vdev, uint8_t *config) memcpy(config, &vsockcfg, sizeof(vsockcfg)); } -static int vhost_vsock_set_guest_cid(VHostVSock *vsock) +static int vhost_vsock_set_guest_cid(VirtIODevice *vdev) { - const VhostOps *vhost_ops = vsock->vhost_dev.vhost_ops; + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + VHostVSock *vsock = VHOST_VSOCK(vdev); + const VhostOps *vhost_ops = vvc->vhost_dev.vhost_ops; int ret; if (!vhost_ops->vhost_vsock_set_guest_cid) { return -ENOSYS; } - ret = vhost_ops->vhost_vsock_set_guest_cid(&vsock->vhost_dev, + ret = vhost_ops->vhost_vsock_set_guest_cid(&vvc->vhost_dev, vsock->conf.guest_cid); if (ret < 0) { return -errno; @@ -56,123 +48,58 @@ static int vhost_vsock_set_guest_cid(VHostVSock *vsock) return 0; } -static int vhost_vsock_set_running(VHostVSock *vsock, int start) +static int vhost_vsock_set_running(VirtIODevice *vdev, int start) { - const VhostOps *vhost_ops = vsock->vhost_dev.vhost_ops; + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + const VhostOps *vhost_ops = vvc->vhost_dev.vhost_ops; int ret; if (!vhost_ops->vhost_vsock_set_running) { return -ENOSYS; } - ret = vhost_ops->vhost_vsock_set_running(&vsock->vhost_dev, start); + ret = vhost_ops->vhost_vsock_set_running(&vvc->vhost_dev, start); if (ret < 0) { return -errno; } return 0; } -static void vhost_vsock_start(VirtIODevice *vdev) -{ - VHostVSock *vsock = VHOST_VSOCK(vdev); - BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); - VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); - int ret; - int i; - - if (!k->set_guest_notifiers) { - error_report("binding does not support guest notifiers"); - return; - } - - ret = vhost_dev_enable_notifiers(&vsock->vhost_dev, vdev); - if (ret < 0) { - error_report("Error enabling host notifiers: %d", -ret); - return; - } - - ret = k->set_guest_notifiers(qbus->parent, vsock->vhost_dev.nvqs, true); - if (ret < 0) { - error_report("Error binding guest notifier: %d", -ret); - goto err_host_notifiers; - } - - vsock->vhost_dev.acked_features = vdev->guest_features; - ret = vhost_dev_start(&vsock->vhost_dev, vdev); - if (ret < 0) { - error_report("Error starting vhost: %d", -ret); - goto err_guest_notifiers; - } - - ret = vhost_vsock_set_running(vsock, 1); - if (ret < 0) { - error_report("Error starting vhost vsock: %d", -ret); - goto err_dev_start; - } - - /* guest_notifier_mask/pending not used yet, so just unmask - * everything here. virtio-pci will do the right thing by - * enabling/disabling irqfd. - */ - for (i = 0; i < vsock->vhost_dev.nvqs; i++) { - vhost_virtqueue_mask(&vsock->vhost_dev, vdev, i, false); - } - - return; - -err_dev_start: - vhost_dev_stop(&vsock->vhost_dev, vdev); -err_guest_notifiers: - k->set_guest_notifiers(qbus->parent, vsock->vhost_dev.nvqs, false); -err_host_notifiers: - vhost_dev_disable_notifiers(&vsock->vhost_dev, vdev); -} - -static void vhost_vsock_stop(VirtIODevice *vdev) -{ - VHostVSock *vsock = VHOST_VSOCK(vdev); - BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); - VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); - int ret; - - if (!k->set_guest_notifiers) { - return; - } - - ret = vhost_vsock_set_running(vsock, 0); - if (ret < 0) { - error_report("vhost vsock set running failed: %d", ret); - return; - } - - vhost_dev_stop(&vsock->vhost_dev, vdev); - - ret = k->set_guest_notifiers(qbus->parent, vsock->vhost_dev.nvqs, false); - if (ret < 0) { - error_report("vhost guest notifier cleanup failed: %d", ret); - return; - } - - vhost_dev_disable_notifiers(&vsock->vhost_dev, vdev); -} static void vhost_vsock_set_status(VirtIODevice *vdev, uint8_t status) { - VHostVSock *vsock = VHOST_VSOCK(vdev); + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); bool should_start = status & VIRTIO_CONFIG_S_DRIVER_OK; + int ret; if (!vdev->vm_running) { should_start = false; } - if (vsock->vhost_dev.started == should_start) { + if (vvc->vhost_dev.started == should_start) { return; } if (should_start) { - vhost_vsock_start(vdev); + ret = vhost_vsock_common_start(vdev); + if (ret < 0) { + return; + } + + ret = vhost_vsock_set_running(vdev, 1); + if (ret < 0) { + vhost_vsock_common_stop(vdev); + error_report("Error starting vhost vsock: %d", -ret); + return; + } } else { - vhost_vsock_stop(vdev); + ret = vhost_vsock_set_running(vdev, 0); + if (ret < 0) { + error_report("vhost vsock set running failed: %d", ret); + return; + } + + vhost_vsock_common_stop(vdev); } } @@ -184,108 +111,6 @@ static uint64_t vhost_vsock_get_features(VirtIODevice *vdev, return requested_features; } -static void vhost_vsock_handle_output(VirtIODevice *vdev, VirtQueue *vq) -{ - /* Do nothing */ -} - -static void vhost_vsock_guest_notifier_mask(VirtIODevice *vdev, int idx, - bool mask) -{ - VHostVSock *vsock = VHOST_VSOCK(vdev); - - vhost_virtqueue_mask(&vsock->vhost_dev, vdev, idx, mask); -} - -static bool vhost_vsock_guest_notifier_pending(VirtIODevice *vdev, int idx) -{ - VHostVSock *vsock = VHOST_VSOCK(vdev); - - return vhost_virtqueue_pending(&vsock->vhost_dev, idx); -} - -static void vhost_vsock_send_transport_reset(VHostVSock *vsock) -{ - VirtQueueElement *elem; - VirtQueue *vq = vsock->event_vq; - struct virtio_vsock_event event = { - .id = cpu_to_le32(VIRTIO_VSOCK_EVENT_TRANSPORT_RESET), - }; - - elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); - if (!elem) { - error_report("vhost-vsock missed transport reset event"); - return; - } - - if (elem->out_num) { - error_report("invalid vhost-vsock event virtqueue element with " - "out buffers"); - goto out; - } - - if (iov_from_buf(elem->in_sg, elem->in_num, 0, - &event, sizeof(event)) != sizeof(event)) { - error_report("vhost-vsock event virtqueue element is too short"); - goto out; - } - - virtqueue_push(vq, elem, sizeof(event)); - virtio_notify(VIRTIO_DEVICE(vsock), vq); - -out: - g_free(elem); -} - -static void vhost_vsock_post_load_timer_cleanup(VHostVSock *vsock) -{ - if (!vsock->post_load_timer) { - return; - } - - timer_del(vsock->post_load_timer); - timer_free(vsock->post_load_timer); - vsock->post_load_timer = NULL; -} - -static void vhost_vsock_post_load_timer_cb(void *opaque) -{ - VHostVSock *vsock = opaque; - - vhost_vsock_post_load_timer_cleanup(vsock); - vhost_vsock_send_transport_reset(vsock); -} - -static int vhost_vsock_pre_save(void *opaque) -{ - VHostVSock *vsock = opaque; - - /* At this point, backend must be stopped, otherwise - * it might keep writing to memory. */ - assert(!vsock->vhost_dev.started); - - return 0; -} - -static int vhost_vsock_post_load(void *opaque, int version_id) -{ - VHostVSock *vsock = opaque; - VirtIODevice *vdev = VIRTIO_DEVICE(vsock); - - if (virtio_queue_get_addr(vdev, 2)) { - /* Defer transport reset event to a vm clock timer so that virtqueue - * changes happen after migration has completed. - */ - assert(!vsock->post_load_timer); - vsock->post_load_timer = - timer_new_ns(QEMU_CLOCK_VIRTUAL, - vhost_vsock_post_load_timer_cb, - vsock); - timer_mod(vsock->post_load_timer, 1); - } - return 0; -} - static const VMStateDescription vmstate_virtio_vhost_vsock = { .name = "virtio-vhost_vsock", .minimum_version_id = VHOST_VSOCK_SAVEVM_VERSION, @@ -294,12 +119,13 @@ static const VMStateDescription vmstate_virtio_vhost_vsock = { VMSTATE_VIRTIO_DEVICE, VMSTATE_END_OF_LIST() }, - .pre_save = vhost_vsock_pre_save, - .post_load = vhost_vsock_post_load, + .pre_save = vhost_vsock_common_pre_save, + .post_load = vhost_vsock_common_post_load, }; static void vhost_vsock_device_realize(DeviceState *dev, Error **errp) { + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev); VirtIODevice *vdev = VIRTIO_DEVICE(dev); VHostVSock *vsock = VHOST_VSOCK(dev); int vhostfd; @@ -331,46 +157,29 @@ static void vhost_vsock_device_realize(DeviceState *dev, Error **errp) } } - virtio_init(vdev, "vhost-vsock", VIRTIO_ID_VSOCK, - sizeof(struct virtio_vsock_config)); + vhost_vsock_common_realize(vdev, "vhost-vsock"); - /* Receive and transmit queues belong to vhost */ - vsock->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, - vhost_vsock_handle_output); - vsock->trans_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, - vhost_vsock_handle_output); - - /* The event queue belongs to QEMU */ - vsock->event_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, - vhost_vsock_handle_output); - - vsock->vhost_dev.nvqs = ARRAY_SIZE(vsock->vhost_vqs); - vsock->vhost_dev.vqs = vsock->vhost_vqs; - ret = vhost_dev_init(&vsock->vhost_dev, (void *)(uintptr_t)vhostfd, + ret = vhost_dev_init(&vvc->vhost_dev, (void *)(uintptr_t)vhostfd, VHOST_BACKEND_TYPE_KERNEL, 0); if (ret < 0) { error_setg_errno(errp, -ret, "vhost-vsock: vhost_dev_init failed"); goto err_virtio; } - ret = vhost_vsock_set_guest_cid(vsock); + ret = vhost_vsock_set_guest_cid(vdev); if (ret < 0) { error_setg_errno(errp, -ret, "vhost-vsock: unable to set guest cid"); goto err_vhost_dev; } - vsock->post_load_timer = NULL; return; err_vhost_dev: - vhost_dev_cleanup(&vsock->vhost_dev); + vhost_dev_cleanup(&vvc->vhost_dev); /* vhost_dev_cleanup() closes the vhostfd passed to vhost_dev_init() */ vhostfd = -1; err_virtio: - virtio_delete_queue(vsock->recv_vq); - virtio_delete_queue(vsock->trans_vq); - virtio_delete_queue(vsock->event_vq); - virtio_cleanup(vdev); + vhost_vsock_common_unrealize(vdev); if (vhostfd >= 0) { close(vhostfd); } @@ -379,19 +188,14 @@ err_virtio: static void vhost_vsock_device_unrealize(DeviceState *dev) { + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev); VirtIODevice *vdev = VIRTIO_DEVICE(dev); - VHostVSock *vsock = VHOST_VSOCK(dev); - - vhost_vsock_post_load_timer_cleanup(vsock); /* This will stop vhost backend if appropriate. */ vhost_vsock_set_status(vdev, 0); - vhost_dev_cleanup(&vsock->vhost_dev); - virtio_delete_queue(vsock->recv_vq); - virtio_delete_queue(vsock->trans_vq); - virtio_delete_queue(vsock->event_vq); - virtio_cleanup(vdev); + vhost_dev_cleanup(&vvc->vhost_dev); + vhost_vsock_common_unrealize(vdev); } static Property vhost_vsock_properties[] = { @@ -407,19 +211,16 @@ static void vhost_vsock_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, vhost_vsock_properties); dc->vmsd = &vmstate_virtio_vhost_vsock; - set_bit(DEVICE_CATEGORY_MISC, dc->categories); vdc->realize = vhost_vsock_device_realize; vdc->unrealize = vhost_vsock_device_unrealize; vdc->get_features = vhost_vsock_get_features; vdc->get_config = vhost_vsock_get_config; vdc->set_status = vhost_vsock_set_status; - vdc->guest_notifier_mask = vhost_vsock_guest_notifier_mask; - vdc->guest_notifier_pending = vhost_vsock_guest_notifier_pending; } static const TypeInfo vhost_vsock_info = { .name = TYPE_VHOST_VSOCK, - .parent = TYPE_VIRTIO_DEVICE, + .parent = TYPE_VHOST_VSOCK_COMMON, .instance_size = sizeof(VHostVSock), .class_init = vhost_vsock_class_init, }; diff --git a/include/hw/virtio/vhost-vsock-common.h b/include/hw/virtio/vhost-vsock-common.h new file mode 100644 index 0000000000..f8b4aaae00 --- /dev/null +++ b/include/hw/virtio/vhost-vsock-common.h @@ -0,0 +1,47 @@ +/* + * Parent class for vhost-vsock devices + * + * Copyright 2015-2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#ifndef _QEMU_VHOST_VSOCK_COMMON_H +#define _QEMU_VHOST_VSOCK_COMMON_H + +#include "hw/virtio/virtio.h" +#include "hw/virtio/vhost.h" + +#define TYPE_VHOST_VSOCK_COMMON "vhost-vsock-common" +#define VHOST_VSOCK_COMMON(obj) \ + OBJECT_CHECK(VHostVSockCommon, (obj), TYPE_VHOST_VSOCK_COMMON) + +enum { + VHOST_VSOCK_SAVEVM_VERSION = 0, + + VHOST_VSOCK_QUEUE_SIZE = 128, +}; + +typedef struct { + VirtIODevice parent; + + struct vhost_virtqueue vhost_vqs[2]; + struct vhost_dev vhost_dev; + + VirtQueue *event_vq; + VirtQueue *recv_vq; + VirtQueue *trans_vq; + + QEMUTimer *post_load_timer; +} VHostVSockCommon; + +int vhost_vsock_common_start(VirtIODevice *vdev); +void vhost_vsock_common_stop(VirtIODevice *vdev); +int vhost_vsock_common_pre_save(void *opaque); +int vhost_vsock_common_post_load(void *opaque, int version_id); +void vhost_vsock_common_realize(VirtIODevice *vdev, const char *name); +void vhost_vsock_common_unrealize(VirtIODevice *vdev); + +#endif /* _QEMU_VHOST_VSOCK_COMMON_H */ diff --git a/include/hw/virtio/vhost-vsock.h b/include/hw/virtio/vhost-vsock.h index bc5a988ee5..8cbb7b90f9 100644 --- a/include/hw/virtio/vhost-vsock.h +++ b/include/hw/virtio/vhost-vsock.h @@ -14,8 +14,7 @@ #ifndef QEMU_VHOST_VSOCK_H #define QEMU_VHOST_VSOCK_H -#include "hw/virtio/virtio.h" -#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-vsock-common.h" #define TYPE_VHOST_VSOCK "vhost-vsock-device" #define VHOST_VSOCK(obj) \ @@ -28,14 +27,8 @@ typedef struct { typedef struct { /*< private >*/ - VirtIODevice parent; + VHostVSockCommon parent; VHostVSockConf conf; - struct vhost_virtqueue vhost_vqs[2]; - struct vhost_dev vhost_dev; - VirtQueue *event_vq; - VirtQueue *recv_vq; - VirtQueue *trans_vq; - QEMUTimer *post_load_timer; /*< public >*/ } VHostVSock; From 5fe97d8829e8bc2a297474df20cd3ac12eebdbba Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 22 May 2020 14:25:11 +0200 Subject: [PATCH 49/58] virtio: add vhost-user-vsock base device This patch introduces a vhost-user device for vsock, using the vhost-vsock-common parent class. The vhost-user-vsock device can be used to implement the virtio-vsock device emulation in user-space. Signed-off-by: Stefano Garzarella Message-Id: <20200522122512.87413-3-sgarzare@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- configure | 3 + hw/virtio/Makefile.objs | 1 + hw/virtio/vhost-user-vsock.c | 181 +++++++++++++++++++++++++++ include/hw/virtio/vhost-user-vsock.h | 36 ++++++ 4 files changed, 221 insertions(+) create mode 100644 hw/virtio/vhost-user-vsock.c create mode 100644 include/hw/virtio/vhost-user-vsock.h diff --git a/configure b/configure index 597e909b53..7c2adf36e5 100755 --- a/configure +++ b/configure @@ -7196,6 +7196,9 @@ if test "$vhost_crypto" = "yes" ; then fi if test "$vhost_vsock" = "yes" ; then echo "CONFIG_VHOST_VSOCK=y" >> $config_host_mak + if test "$vhost_user" = "yes" ; then + echo "CONFIG_VHOST_USER_VSOCK=y" >> $config_host_mak + fi fi if test "$vhost_kernel" = "yes" ; then echo "CONFIG_VHOST_KERNEL=y" >> $config_host_mak diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index b1eeb44eac..dd42daedb1 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -18,6 +18,7 @@ common-obj-$(call land,$(CONFIG_VIRTIO_PMEM),$(CONFIG_VIRTIO_PCI)) += virtio-pme obj-$(call land,$(CONFIG_VHOST_USER_FS),$(CONFIG_VIRTIO_PCI)) += vhost-user-fs-pci.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-common.o vhost-vsock.o +obj-$(CONFIG_VHOST_USER_VSOCK) += vhost-vsock-common.o vhost-user-vsock.o ifeq ($(CONFIG_VIRTIO_PCI),y) obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-pci.o diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c new file mode 100644 index 0000000000..3534a39d62 --- /dev/null +++ b/hw/virtio/vhost-user-vsock.c @@ -0,0 +1,181 @@ +/* + * Vhost-user vsock virtio device + * + * Copyright 2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" + +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-user-vsock.h" + +static const int user_feature_bits[] = { + VIRTIO_F_VERSION_1, + VIRTIO_RING_F_INDIRECT_DESC, + VIRTIO_RING_F_EVENT_IDX, + VIRTIO_F_NOTIFY_ON_EMPTY, + VHOST_INVALID_FEATURE_BIT +}; + +static void vuv_get_config(VirtIODevice *vdev, uint8_t *config) +{ + VHostUserVSock *vsock = VHOST_USER_VSOCK(vdev); + + memcpy(config, &vsock->vsockcfg, sizeof(struct virtio_vsock_config)); +} + +static int vuv_handle_config_change(struct vhost_dev *dev) +{ + VHostUserVSock *vsock = VHOST_USER_VSOCK(dev->vdev); + int ret = vhost_dev_get_config(dev, (uint8_t *)&vsock->vsockcfg, + sizeof(struct virtio_vsock_config)); + if (ret < 0) { + error_report("get config space failed"); + return -1; + } + + virtio_notify_config(dev->vdev); + + return 0; +} + +const VhostDevConfigOps vsock_ops = { + .vhost_dev_config_notifier = vuv_handle_config_change, +}; + +static void vuv_set_status(VirtIODevice *vdev, uint8_t status) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + bool should_start = status & VIRTIO_CONFIG_S_DRIVER_OK; + + if (!vdev->vm_running) { + should_start = false; + } + + if (vvc->vhost_dev.started == should_start) { + return; + } + + if (should_start) { + int ret = vhost_vsock_common_start(vdev); + if (ret < 0) { + return; + } + } else { + vhost_vsock_common_stop(vdev); + } +} + +static uint64_t vuv_get_features(VirtIODevice *vdev, + uint64_t features, + Error **errp) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + return vhost_get_features(&vvc->vhost_dev, user_feature_bits, features); +} + +static const VMStateDescription vuv_vmstate = { + .name = "vhost-user-vsock", + .unmigratable = 1, +}; + +static void vuv_device_realize(DeviceState *dev, Error **errp) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev); + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserVSock *vsock = VHOST_USER_VSOCK(dev); + int ret; + + if (!vsock->conf.chardev.chr) { + error_setg(errp, "missing chardev"); + return; + } + + if (!vhost_user_init(&vsock->vhost_user, &vsock->conf.chardev, errp)) { + return; + } + + vhost_vsock_common_realize(vdev, "vhost-user-vsock"); + + vhost_dev_set_config_notifier(&vvc->vhost_dev, &vsock_ops); + + ret = vhost_dev_init(&vvc->vhost_dev, &vsock->vhost_user, + VHOST_BACKEND_TYPE_USER, 0); + if (ret < 0) { + error_setg_errno(errp, -ret, "vhost_dev_init failed"); + goto err_virtio; + } + + ret = vhost_dev_get_config(&vvc->vhost_dev, (uint8_t *)&vsock->vsockcfg, + sizeof(struct virtio_vsock_config)); + if (ret < 0) { + error_setg_errno(errp, -ret, "get config space failed"); + goto err_vhost_dev; + } + + return; + +err_vhost_dev: + vhost_dev_cleanup(&vvc->vhost_dev); +err_virtio: + vhost_vsock_common_unrealize(vdev); + vhost_user_cleanup(&vsock->vhost_user); + return; +} + +static void vuv_device_unrealize(DeviceState *dev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev); + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserVSock *vsock = VHOST_USER_VSOCK(dev); + + /* This will stop vhost backend if appropriate. */ + vuv_set_status(vdev, 0); + + vhost_dev_cleanup(&vvc->vhost_dev); + + vhost_vsock_common_unrealize(vdev); + + vhost_user_cleanup(&vsock->vhost_user); + +} + +static Property vuv_properties[] = { + DEFINE_PROP_CHR("chardev", VHostUserVSock, conf.chardev), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vuv_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, vuv_properties); + dc->vmsd = &vuv_vmstate; + vdc->realize = vuv_device_realize; + vdc->unrealize = vuv_device_unrealize; + vdc->get_features = vuv_get_features; + vdc->get_config = vuv_get_config; + vdc->set_status = vuv_set_status; +} + +static const TypeInfo vuv_info = { + .name = TYPE_VHOST_USER_VSOCK, + .parent = TYPE_VHOST_VSOCK_COMMON, + .instance_size = sizeof(VHostUserVSock), + .class_init = vuv_class_init, +}; + +static void vuv_register_types(void) +{ + type_register_static(&vuv_info); +} + +type_init(vuv_register_types) diff --git a/include/hw/virtio/vhost-user-vsock.h b/include/hw/virtio/vhost-user-vsock.h new file mode 100644 index 0000000000..4e128a4b9f --- /dev/null +++ b/include/hw/virtio/vhost-user-vsock.h @@ -0,0 +1,36 @@ +/* + * Vhost-user vsock virtio device + * + * Copyright 2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#ifndef _QEMU_VHOST_USER_VSOCK_H +#define _QEMU_VHOST_USER_VSOCK_H + +#include "hw/virtio/vhost-vsock-common.h" +#include "hw/virtio/vhost-user.h" +#include "standard-headers/linux/virtio_vsock.h" + +#define TYPE_VHOST_USER_VSOCK "vhost-user-vsock-device" +#define VHOST_USER_VSOCK(obj) \ + OBJECT_CHECK(VHostUserVSock, (obj), TYPE_VHOST_USER_VSOCK) + +typedef struct { + CharBackend chardev; +} VHostUserVSockConf; + +typedef struct { + /*< private >*/ + VHostVSockCommon parent; + VhostUserState vhost_user; + VHostUserVSockConf conf; + struct virtio_vsock_config vsockcfg; + + /*< public >*/ +} VHostUserVSock; + +#endif /* _QEMU_VHOST_USER_VSOCK_H */ From 9b83bb27477f3fb2c08db32ea2ee9e8f817f0f9d Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 22 May 2020 14:25:12 +0200 Subject: [PATCH 50/58] virtio: add vhost-user-vsock-pci device Add the PCI version of vhost-user-vsock Launch QEMU like this: qemu -chardev socket,path=/tmp/vm.vsock,id=chr0 \ -device vhost-user-vsock-pci,chardev=chr0 Signed-off-by: Stefano Garzarella Message-Id: <20200522122512.87413-4-sgarzare@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/Makefile.objs | 1 + hw/virtio/vhost-user-vsock-pci.c | 84 ++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 hw/virtio/vhost-user-vsock-pci.c diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index dd42daedb1..13e75f171f 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -22,6 +22,7 @@ obj-$(CONFIG_VHOST_USER_VSOCK) += vhost-vsock-common.o vhost-user-vsock.o ifeq ($(CONFIG_VIRTIO_PCI),y) obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-pci.o +obj-$(CONFIG_VHOST_USER_VSOCK) += vhost-user-vsock-pci.o obj-$(CONFIG_VHOST_USER_BLK) += vhost-user-blk-pci.o obj-$(CONFIG_VHOST_USER_INPUT) += vhost-user-input-pci.o obj-$(CONFIG_VHOST_USER_SCSI) += vhost-user-scsi-pci.o diff --git a/hw/virtio/vhost-user-vsock-pci.c b/hw/virtio/vhost-user-vsock-pci.c new file mode 100644 index 0000000000..0a6847e6fc --- /dev/null +++ b/hw/virtio/vhost-user-vsock-pci.c @@ -0,0 +1,84 @@ +/* + * Vhost-user vsock PCI Bindings + * + * Copyright 2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" + +#include "virtio-pci.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-user-vsock.h" + +typedef struct VHostUserVSockPCI VHostUserVSockPCI; + +/* + * vhost-user-vsock-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VHOST_USER_VSOCK_PCI "vhost-user-vsock-pci-base" +#define VHOST_USER_VSOCK_PCI(obj) \ + OBJECT_CHECK(VHostUserVSockPCI, (obj), TYPE_VHOST_USER_VSOCK_PCI) + +struct VHostUserVSockPCI { + VirtIOPCIProxy parent_obj; + VHostUserVSock vdev; +}; + +/* vhost-user-vsock-pci */ + +static Property vhost_user_vsock_pci_properties[] = { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_user_vsock_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostUserVSockPCI *dev = VHOST_USER_VSOCK_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); + object_property_set_bool(OBJECT(vdev), true, "realized", errp); +} + +static void vhost_user_vsock_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_user_vsock_pci_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + device_class_set_props(dc, vhost_user_vsock_pci_properties); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_VSOCK; + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER; +} + +static void vhost_user_vsock_pci_instance_init(Object *obj) +{ + VHostUserVSockPCI *dev = VHOST_USER_VSOCK_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_USER_VSOCK); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_vsock_pci_info = { + .base_name = TYPE_VHOST_USER_VSOCK_PCI, + .generic_name = "vhost-user-vsock-pci", + .transitional_name = "vhost-user-vsock-pci-transitional", + .non_transitional_name = "vhost-user-vsock-pci-non-transitional", + .instance_size = sizeof(VHostUserVSockPCI), + .instance_init = vhost_user_vsock_pci_instance_init, + .class_init = vhost_user_vsock_pci_class_init, +}; + +static void virtio_pci_vhost_register(void) +{ + virtio_pci_types_register(&vhost_user_vsock_pci_info); +} + +type_init(virtio_pci_vhost_register) From 1dc32f9aeb127d995efd52a29f57460c011780da Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 20 May 2020 15:19:46 +0200 Subject: [PATCH 51/58] acpi: make build_madt() more generic. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove PCMachineState dependency from build_madt(). Pass AcpiDeviceIf as separate argument instead of depending on PCMachineState->acpi_dev. Signed-off-by: Gerd Hoffmann Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Igor Mammedov Message-Id: <20200520132003.9492-6-kraxel@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 1ecb68f45f..d217fc1fe6 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -366,14 +366,13 @@ void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid, } static void -build_madt(GArray *table_data, BIOSLinker *linker, PCMachineState *pcms) +build_madt(GArray *table_data, BIOSLinker *linker, + X86MachineState *x86ms, AcpiDeviceIf *adev) { - MachineClass *mc = MACHINE_GET_CLASS(pcms); - X86MachineState *x86ms = X86_MACHINE(pcms); - const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(pcms)); + MachineClass *mc = MACHINE_GET_CLASS(x86ms); + const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms)); int madt_start = table_data->len; - AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(pcms->acpi_dev); - AcpiDeviceIf *adev = ACPI_DEVICE_IF(pcms->acpi_dev); + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(adev); bool x2apic_mode = false; AcpiMultipleApicTable *madt; @@ -2708,7 +2707,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) aml_len += tables_blob->len - fadt; acpi_add_table(table_offsets, tables_blob); - build_madt(tables_blob, tables->linker, pcms); + build_madt(tables_blob, tables->linker, x86ms, + ACPI_DEVICE_IF(pcms->acpi_dev)); vmgenid_dev = find_vmgenid_dev(); if (vmgenid_dev) { From eb66ffabc00a7b062b2c73b9178eb2679328813b Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 20 May 2020 15:19:47 +0200 Subject: [PATCH 52/58] acpi: create acpi-common.c and move madt code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We'll need madt support for microvm. Signed-off-by: Gerd Hoffmann Reviewed-by: Igor Mammedov Message-Id: <20200520132003.9492-7-kraxel@redhat.com> Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/Makefile.objs | 1 + hw/i386/acpi-build.c | 126 +--------------------------------- hw/i386/acpi-common.c | 152 ++++++++++++++++++++++++++++++++++++++++++ hw/i386/acpi-common.h | 14 ++++ 4 files changed, 170 insertions(+), 123 deletions(-) create mode 100644 hw/i386/acpi-common.c create mode 100644 hw/i386/acpi-common.h diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs index 8ce1b26533..6abc74551a 100644 --- a/hw/i386/Makefile.objs +++ b/hw/i386/Makefile.objs @@ -16,4 +16,5 @@ obj-$(CONFIG_VMMOUSE) += vmmouse.o obj-$(CONFIG_PC) += port92.o obj-y += kvmvapic.o +obj-$(CONFIG_ACPI) += acpi-common.o obj-$(CONFIG_PC) += acpi-build.o diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index d217fc1fe6..26c0c8aefa 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -24,6 +24,7 @@ #include "qapi/error.h" #include "qapi/qmp/qnum.h" #include "acpi-build.h" +#include "acpi-common.h" #include "qemu/bitmap.h" #include "qemu/error-report.h" #include "hw/pci/pci.h" @@ -89,9 +90,6 @@ #define ACPI_BUILD_DPRINTF(fmt, ...) #endif -/* Default IOAPIC ID */ -#define ACPI_BUILD_IOAPIC_ID 0x0 - typedef struct AcpiPmInfo { bool s3_disabled; bool s4_disabled; @@ -327,124 +325,6 @@ build_facs(GArray *table_data) facs->length = cpu_to_le32(sizeof(*facs)); } -void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid, - const CPUArchIdList *apic_ids, GArray *entry) -{ - uint32_t apic_id = apic_ids->cpus[uid].arch_id; - - /* ACPI spec says that LAPIC entry for non present - * CPU may be omitted from MADT or it must be marked - * as disabled. However omitting non present CPU from - * MADT breaks hotplug on linux. So possible CPUs - * should be put in MADT but kept disabled. - */ - if (apic_id < 255) { - AcpiMadtProcessorApic *apic = acpi_data_push(entry, sizeof *apic); - - apic->type = ACPI_APIC_PROCESSOR; - apic->length = sizeof(*apic); - apic->processor_id = uid; - apic->local_apic_id = apic_id; - if (apic_ids->cpus[uid].cpu != NULL) { - apic->flags = cpu_to_le32(1); - } else { - apic->flags = cpu_to_le32(0); - } - } else { - AcpiMadtProcessorX2Apic *apic = acpi_data_push(entry, sizeof *apic); - - apic->type = ACPI_APIC_LOCAL_X2APIC; - apic->length = sizeof(*apic); - apic->uid = cpu_to_le32(uid); - apic->x2apic_id = cpu_to_le32(apic_id); - if (apic_ids->cpus[uid].cpu != NULL) { - apic->flags = cpu_to_le32(1); - } else { - apic->flags = cpu_to_le32(0); - } - } -} - -static void -build_madt(GArray *table_data, BIOSLinker *linker, - X86MachineState *x86ms, AcpiDeviceIf *adev) -{ - MachineClass *mc = MACHINE_GET_CLASS(x86ms); - const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms)); - int madt_start = table_data->len; - AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(adev); - bool x2apic_mode = false; - - AcpiMultipleApicTable *madt; - AcpiMadtIoApic *io_apic; - AcpiMadtIntsrcovr *intsrcovr; - int i; - - madt = acpi_data_push(table_data, sizeof *madt); - madt->local_apic_address = cpu_to_le32(APIC_DEFAULT_ADDRESS); - madt->flags = cpu_to_le32(1); - - for (i = 0; i < apic_ids->len; i++) { - adevc->madt_cpu(adev, i, apic_ids, table_data); - if (apic_ids->cpus[i].arch_id > 254) { - x2apic_mode = true; - } - } - - io_apic = acpi_data_push(table_data, sizeof *io_apic); - io_apic->type = ACPI_APIC_IO; - io_apic->length = sizeof(*io_apic); - io_apic->io_apic_id = ACPI_BUILD_IOAPIC_ID; - io_apic->address = cpu_to_le32(IO_APIC_DEFAULT_ADDRESS); - io_apic->interrupt = cpu_to_le32(0); - - if (x86ms->apic_xrupt_override) { - intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr); - intsrcovr->type = ACPI_APIC_XRUPT_OVERRIDE; - intsrcovr->length = sizeof(*intsrcovr); - intsrcovr->source = 0; - intsrcovr->gsi = cpu_to_le32(2); - intsrcovr->flags = cpu_to_le16(0); /* conforms to bus specifications */ - } - for (i = 1; i < 16; i++) { -#define ACPI_BUILD_PCI_IRQS ((1<<5) | (1<<9) | (1<<10) | (1<<11)) - if (!(ACPI_BUILD_PCI_IRQS & (1 << i))) { - /* No need for a INT source override structure. */ - continue; - } - intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr); - intsrcovr->type = ACPI_APIC_XRUPT_OVERRIDE; - intsrcovr->length = sizeof(*intsrcovr); - intsrcovr->source = i; - intsrcovr->gsi = cpu_to_le32(i); - intsrcovr->flags = cpu_to_le16(0xd); /* active high, level triggered */ - } - - if (x2apic_mode) { - AcpiMadtLocalX2ApicNmi *local_nmi; - - local_nmi = acpi_data_push(table_data, sizeof *local_nmi); - local_nmi->type = ACPI_APIC_LOCAL_X2APIC_NMI; - local_nmi->length = sizeof(*local_nmi); - local_nmi->uid = 0xFFFFFFFF; /* all processors */ - local_nmi->flags = cpu_to_le16(0); - local_nmi->lint = 1; /* ACPI_LINT1 */ - } else { - AcpiMadtLocalNmi *local_nmi; - - local_nmi = acpi_data_push(table_data, sizeof *local_nmi); - local_nmi->type = ACPI_APIC_LOCAL_NMI; - local_nmi->length = sizeof(*local_nmi); - local_nmi->processor_id = 0xff; /* all processors */ - local_nmi->flags = cpu_to_le16(0); - local_nmi->lint = 1; /* ACPI_LINT1 */ - } - - build_header(linker, table_data, - (void *)(table_data->data + madt_start), "APIC", - table_data->len - madt_start, 1, NULL, NULL); -} - static void build_append_pcihp_notify_entry(Aml *method, int slot) { Aml *if_ctx; @@ -2707,8 +2587,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) aml_len += tables_blob->len - fadt; acpi_add_table(table_offsets, tables_blob); - build_madt(tables_blob, tables->linker, x86ms, - ACPI_DEVICE_IF(pcms->acpi_dev)); + acpi_build_madt(tables_blob, tables->linker, x86ms, + ACPI_DEVICE_IF(pcms->acpi_dev)); vmgenid_dev = find_vmgenid_dev(); if (vmgenid_dev) { diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c new file mode 100644 index 0000000000..5caca16a0b --- /dev/null +++ b/hw/i386/acpi-common.c @@ -0,0 +1,152 @@ +/* Support for generating ACPI tables and passing them to Guests + * + * Copyright (C) 2008-2010 Kevin O'Connor + * Copyright (C) 2006 Fabrice Bellard + * Copyright (C) 2013 Red Hat Inc + * + * Author: Michael S. Tsirkin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see . + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" + +#include "exec/memory.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/aml-build.h" +#include "hw/acpi/utils.h" +#include "hw/i386/pc.h" +#include "target/i386/cpu.h" + +#include "acpi-build.h" +#include "acpi-common.h" + +void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid, + const CPUArchIdList *apic_ids, GArray *entry) +{ + uint32_t apic_id = apic_ids->cpus[uid].arch_id; + + /* ACPI spec says that LAPIC entry for non present + * CPU may be omitted from MADT or it must be marked + * as disabled. However omitting non present CPU from + * MADT breaks hotplug on linux. So possible CPUs + * should be put in MADT but kept disabled. + */ + if (apic_id < 255) { + AcpiMadtProcessorApic *apic = acpi_data_push(entry, sizeof *apic); + + apic->type = ACPI_APIC_PROCESSOR; + apic->length = sizeof(*apic); + apic->processor_id = uid; + apic->local_apic_id = apic_id; + if (apic_ids->cpus[uid].cpu != NULL) { + apic->flags = cpu_to_le32(1); + } else { + apic->flags = cpu_to_le32(0); + } + } else { + AcpiMadtProcessorX2Apic *apic = acpi_data_push(entry, sizeof *apic); + + apic->type = ACPI_APIC_LOCAL_X2APIC; + apic->length = sizeof(*apic); + apic->uid = cpu_to_le32(uid); + apic->x2apic_id = cpu_to_le32(apic_id); + if (apic_ids->cpus[uid].cpu != NULL) { + apic->flags = cpu_to_le32(1); + } else { + apic->flags = cpu_to_le32(0); + } + } +} + +void acpi_build_madt(GArray *table_data, BIOSLinker *linker, + X86MachineState *x86ms, AcpiDeviceIf *adev) +{ + MachineClass *mc = MACHINE_GET_CLASS(x86ms); + const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms)); + int madt_start = table_data->len; + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(adev); + bool x2apic_mode = false; + + AcpiMultipleApicTable *madt; + AcpiMadtIoApic *io_apic; + AcpiMadtIntsrcovr *intsrcovr; + int i; + + madt = acpi_data_push(table_data, sizeof *madt); + madt->local_apic_address = cpu_to_le32(APIC_DEFAULT_ADDRESS); + madt->flags = cpu_to_le32(1); + + for (i = 0; i < apic_ids->len; i++) { + adevc->madt_cpu(adev, i, apic_ids, table_data); + if (apic_ids->cpus[i].arch_id > 254) { + x2apic_mode = true; + } + } + + io_apic = acpi_data_push(table_data, sizeof *io_apic); + io_apic->type = ACPI_APIC_IO; + io_apic->length = sizeof(*io_apic); + io_apic->io_apic_id = ACPI_BUILD_IOAPIC_ID; + io_apic->address = cpu_to_le32(IO_APIC_DEFAULT_ADDRESS); + io_apic->interrupt = cpu_to_le32(0); + + if (x86ms->apic_xrupt_override) { + intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr); + intsrcovr->type = ACPI_APIC_XRUPT_OVERRIDE; + intsrcovr->length = sizeof(*intsrcovr); + intsrcovr->source = 0; + intsrcovr->gsi = cpu_to_le32(2); + intsrcovr->flags = cpu_to_le16(0); /* conforms to bus specifications */ + } + for (i = 1; i < 16; i++) { +#define ACPI_BUILD_PCI_IRQS ((1<<5) | (1<<9) | (1<<10) | (1<<11)) + if (!(ACPI_BUILD_PCI_IRQS & (1 << i))) { + /* No need for a INT source override structure. */ + continue; + } + intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr); + intsrcovr->type = ACPI_APIC_XRUPT_OVERRIDE; + intsrcovr->length = sizeof(*intsrcovr); + intsrcovr->source = i; + intsrcovr->gsi = cpu_to_le32(i); + intsrcovr->flags = cpu_to_le16(0xd); /* active high, level triggered */ + } + + if (x2apic_mode) { + AcpiMadtLocalX2ApicNmi *local_nmi; + + local_nmi = acpi_data_push(table_data, sizeof *local_nmi); + local_nmi->type = ACPI_APIC_LOCAL_X2APIC_NMI; + local_nmi->length = sizeof(*local_nmi); + local_nmi->uid = 0xFFFFFFFF; /* all processors */ + local_nmi->flags = cpu_to_le16(0); + local_nmi->lint = 1; /* ACPI_LINT1 */ + } else { + AcpiMadtLocalNmi *local_nmi; + + local_nmi = acpi_data_push(table_data, sizeof *local_nmi); + local_nmi->type = ACPI_APIC_LOCAL_NMI; + local_nmi->length = sizeof(*local_nmi); + local_nmi->processor_id = 0xff; /* all processors */ + local_nmi->flags = cpu_to_le16(0); + local_nmi->lint = 1; /* ACPI_LINT1 */ + } + + build_header(linker, table_data, + (void *)(table_data->data + madt_start), "APIC", + table_data->len - madt_start, 1, NULL, NULL); +} + diff --git a/hw/i386/acpi-common.h b/hw/i386/acpi-common.h new file mode 100644 index 0000000000..c30e461f18 --- /dev/null +++ b/hw/i386/acpi-common.h @@ -0,0 +1,14 @@ +#ifndef HW_I386_ACPI_COMMON_H +#define HW_I386_ACPI_COMMON_H +#include "include/hw/acpi/acpi_dev_interface.h" + +#include "include/hw/acpi/bios-linker-loader.h" +#include "include/hw/i386/x86.h" + +/* Default IOAPIC ID */ +#define ACPI_BUILD_IOAPIC_ID 0x0 + +void acpi_build_madt(GArray *table_data, BIOSLinker *linker, + X86MachineState *x86ms, AcpiDeviceIf *adev); + +#endif From 5794d34a13d11e12d9bc53ea4ade997c2ff81c4c Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 20 May 2020 15:19:48 +0200 Subject: [PATCH 53/58] acpi: madt: skip pci override on pci-less systems. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Needed for microvm. Signed-off-by: Gerd Hoffmann Reviewed-by: Igor Mammedov Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20200520132003.9492-8-kraxel@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 2 +- hw/i386/acpi-common.c | 26 +++++++++++++++----------- hw/i386/acpi-common.h | 3 ++- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 26c0c8aefa..473cbdfffd 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2588,7 +2588,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) acpi_add_table(table_offsets, tables_blob); acpi_build_madt(tables_blob, tables->linker, x86ms, - ACPI_DEVICE_IF(pcms->acpi_dev)); + ACPI_DEVICE_IF(pcms->acpi_dev), true); vmgenid_dev = find_vmgenid_dev(); if (vmgenid_dev) { diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c index 5caca16a0b..ab9b00581a 100644 --- a/hw/i386/acpi-common.c +++ b/hw/i386/acpi-common.c @@ -72,7 +72,8 @@ void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid, } void acpi_build_madt(GArray *table_data, BIOSLinker *linker, - X86MachineState *x86ms, AcpiDeviceIf *adev) + X86MachineState *x86ms, AcpiDeviceIf *adev, + bool has_pci) { MachineClass *mc = MACHINE_GET_CLASS(x86ms); const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms)); @@ -111,18 +112,21 @@ void acpi_build_madt(GArray *table_data, BIOSLinker *linker, intsrcovr->gsi = cpu_to_le32(2); intsrcovr->flags = cpu_to_le16(0); /* conforms to bus specifications */ } - for (i = 1; i < 16; i++) { + + if (has_pci) { + for (i = 1; i < 16; i++) { #define ACPI_BUILD_PCI_IRQS ((1<<5) | (1<<9) | (1<<10) | (1<<11)) - if (!(ACPI_BUILD_PCI_IRQS & (1 << i))) { - /* No need for a INT source override structure. */ - continue; + if (!(ACPI_BUILD_PCI_IRQS & (1 << i))) { + /* No need for a INT source override structure. */ + continue; + } + intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr); + intsrcovr->type = ACPI_APIC_XRUPT_OVERRIDE; + intsrcovr->length = sizeof(*intsrcovr); + intsrcovr->source = i; + intsrcovr->gsi = cpu_to_le32(i); + intsrcovr->flags = cpu_to_le16(0xd); /* active high, level triggered */ } - intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr); - intsrcovr->type = ACPI_APIC_XRUPT_OVERRIDE; - intsrcovr->length = sizeof(*intsrcovr); - intsrcovr->source = i; - intsrcovr->gsi = cpu_to_le32(i); - intsrcovr->flags = cpu_to_le16(0xd); /* active high, level triggered */ } if (x2apic_mode) { diff --git a/hw/i386/acpi-common.h b/hw/i386/acpi-common.h index c30e461f18..9cac18dddf 100644 --- a/hw/i386/acpi-common.h +++ b/hw/i386/acpi-common.h @@ -9,6 +9,7 @@ #define ACPI_BUILD_IOAPIC_ID 0x0 void acpi_build_madt(GArray *table_data, BIOSLinker *linker, - X86MachineState *x86ms, AcpiDeviceIf *adev); + X86MachineState *x86ms, AcpiDeviceIf *adev, + bool has_pci); #endif From c8ed8f57cc558045638512249ffc5f3633f76db1 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 20 May 2020 15:19:49 +0200 Subject: [PATCH 54/58] acpi: fadt: add hw-reduced sleep register support Add fields to struct AcpiFadtData and update build_fadt() to properly generate sleep register entries. Signed-off-by: Gerd Hoffmann Reviewed-by: Igor Mammedov Message-Id: <20200520132003.9492-9-kraxel@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/aml-build.c | 4 ++-- include/hw/acpi/acpi-defs.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index d24e9e6c3a..2cb7b991ef 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -1866,9 +1866,9 @@ void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, } /* SLEEP_CONTROL_REG */ - build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0); + build_append_gas_from_struct(tbl, &f->sleep_ctl); /* SLEEP_STATUS_REG */ - build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0); + build_append_gas_from_struct(tbl, &f->sleep_sts); /* TODO: extra fields need to be added to support revisions above rev5 */ assert(f->rev == 5); diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h index c13327fa78..3be9ab5049 100644 --- a/include/hw/acpi/acpi-defs.h +++ b/include/hw/acpi/acpi-defs.h @@ -88,6 +88,8 @@ typedef struct AcpiFadtData { struct AcpiGenericAddress pm_tmr; /* PM_TMR_BLK */ struct AcpiGenericAddress gpe0_blk; /* GPE0_BLK */ struct AcpiGenericAddress reset_reg; /* RESET_REG */ + struct AcpiGenericAddress sleep_ctl; /* SLEEP_CONTROL_REG */ + struct AcpiGenericAddress sleep_sts; /* SLEEP_STATUS_REG */ uint8_t reset_val; /* RESET_VALUE */ uint8_t rev; /* Revision */ uint32_t flags; /* Flags */ From 32905fc95c0a082f4cb28b8076e0160b3c62c7f8 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 20 May 2020 15:19:50 +0200 Subject: [PATCH 55/58] acpi: ged: rename event memory region MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename memory region and callbacks and ops to carry "evt" in the name because a second region will be added shortly. Signed-off-by: Gerd Hoffmann Message-Id: <20200520132003.9492-10-kraxel@redhat.com> Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Igor Mammedow Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/generic_event_device.c | 16 ++++++++-------- include/hw/acpi/generic_event_device.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index b1cbdd86b6..1cb34111e5 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -142,7 +142,7 @@ void build_ged_aml(Aml *table, const char *name, HotplugHandler *hotplug_dev, } /* Memory read by the GED _EVT AML dynamic method */ -static uint64_t ged_read(void *opaque, hwaddr addr, unsigned size) +static uint64_t ged_evt_read(void *opaque, hwaddr addr, unsigned size) { uint64_t val = 0; GEDState *ged_st = opaque; @@ -161,14 +161,14 @@ static uint64_t ged_read(void *opaque, hwaddr addr, unsigned size) } /* Nothing is expected to be written to the GED memory region */ -static void ged_write(void *opaque, hwaddr addr, uint64_t data, - unsigned int size) +static void ged_evt_write(void *opaque, hwaddr addr, uint64_t data, + unsigned int size) { } -static const MemoryRegionOps ged_ops = { - .read = ged_read, - .write = ged_write, +static const MemoryRegionOps ged_evt_ops = { + .read = ged_evt_read, + .write = ged_evt_write, .endianness = DEVICE_LITTLE_ENDIAN, .valid = { .min_access_size = 4, @@ -287,9 +287,9 @@ static void acpi_ged_initfn(Object *obj) SysBusDevice *sbd = SYS_BUS_DEVICE(obj); GEDState *ged_st = &s->ged_state; - memory_region_init_io(&ged_st->io, obj, &ged_ops, ged_st, + memory_region_init_io(&ged_st->evt, obj, &ged_evt_ops, ged_st, TYPE_ACPI_GED, ACPI_GED_EVT_SEL_LEN); - sysbus_init_mmio(sbd, &ged_st->io); + sysbus_init_mmio(sbd, &ged_st->evt); sysbus_init_irq(sbd, &s->irq); diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index 83917de024..90a9180db5 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -86,7 +86,7 @@ #define ACPI_GED_NVDIMM_HOTPLUG_EVT 0x4 typedef struct GEDState { - MemoryRegion io; + MemoryRegion evt; uint32_t sel; } GEDState; From 62925fd2b8b778539607b321216cd54897ddb63a Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 7 May 2020 17:37:58 -0400 Subject: [PATCH 56/58] Fix parameter type in vhost migration log path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ‘enable’ parameter to the vhost_migration_log() function is given as an int, but "true"/"false" values are passed in wherever it is invoked. Inside the function itself it is only ever compared with bool values. Therefore the parameter value itself should be changed to bool. Signed-off-by: Raphael Norwitz Message-Id: Reviewed-by: Eric Blake Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index aff98a0ede..aa06a36919 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -809,12 +809,12 @@ err_features: return r; } -static int vhost_migration_log(MemoryListener *listener, int enable) +static int vhost_migration_log(MemoryListener *listener, bool enable) { struct vhost_dev *dev = container_of(listener, struct vhost_dev, memory_listener); int r; - if (!!enable == dev->log_enabled) { + if (enable == dev->log_enabled) { return 0; } if (!dev->started) { From 12fcf49c1a016cc05638b965f4e7b909794bffa2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 17 Mar 2020 15:59:08 -0400 Subject: [PATCH 57/58] pci: Display PCI IRQ pin in "info pci" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sometimes it would be good to be able to read the pin number along with the IRQ number allocated. Since we'll dump the IRQ number, no reason to not dump the pin information. For example, the vfio-pci device will overwrite the pin with the hardware pin number. It would be nice to know the pin number of one assigned device from QMP/HMP. CC: Dr. David Alan Gilbert CC: Alex Williamson CC: Michael S. Tsirkin CC: Marcel Apfelbaum CC: Julia Suvorova CC: Markus Armbruster Signed-off-by: Peter Xu Message-Id: <20200317195908.283800-1-peterx@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé Acked-by: Dr. David Alan Gilbert Acked-by: Markus Armbruster --- hw/pci/pci.c | 1 + monitor/hmp-cmds.c | 3 ++- qapi/misc.json | 6 ++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 1b88a32cf7..a60cf3ae3b 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1776,6 +1776,7 @@ static PciDeviceInfo *qmp_query_pci_device(PCIDevice *dev, PCIBus *bus, info->regions = qmp_query_pci_regions(dev); info->qdev_id = g_strdup(dev->qdev.id ? dev->qdev.id : ""); + info->irq_pin = dev->config[PCI_INTERRUPT_PIN]; if (dev->config[PCI_INTERRUPT_PIN] != 0) { info->has_irq = true; info->irq = dev->config[PCI_INTERRUPT_LINE]; diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index 9c61e769ca..e03adf0d4d 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -688,7 +688,8 @@ static void hmp_info_pci_device(Monitor *mon, const PciDeviceInfo *dev) } if (dev->has_irq) { - monitor_printf(mon, " IRQ %" PRId64 ".\n", dev->irq); + monitor_printf(mon, " IRQ %" PRId64 ", pin %c\n", + dev->irq, (char)('A' + dev->irq_pin - 1)); } if (dev->has_pci_bridge) { diff --git a/qapi/misc.json b/qapi/misc.json index 99b90ac80b..a5a0beb902 100644 --- a/qapi/misc.json +++ b/qapi/misc.json @@ -403,6 +403,8 @@ # # @irq: if an IRQ is assigned to the device, the IRQ number # +# @irq_pin: the IRQ pin, zero means no IRQ (since 5.1) +# # @qdev_id: the device name of the PCI device # # @pci_bridge: if the device is a PCI bridge, the bridge information @@ -417,8 +419,8 @@ { 'struct': 'PciDeviceInfo', 'data': {'bus': 'int', 'slot': 'int', 'function': 'int', 'class_info': 'PciDeviceClass', 'id': 'PciDeviceId', - '*irq': 'int', 'qdev_id': 'str', '*pci_bridge': 'PciBridgeInfo', - 'regions': ['PciMemoryRegion']} } + '*irq': 'int', 'irq_pin': 'int', 'qdev_id': 'str', + '*pci_bridge': 'PciBridgeInfo', 'regions': ['PciMemoryRegion'] }} ## # @PciInfo: From 10d35e581901c09ee3817ac7cddd296d05291a9d Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 10 Jun 2020 13:43:51 +0800 Subject: [PATCH 58/58] virtio-pci: fix queue_enable write Spec said: The driver uses this to selectively prevent the device from executing requests from this virtqueue. 1 - enabled; 0 - disabled. Though write 0 to queue_enable is forbidden by the spec, we should not assume that the value is 1. Fix this by ignore the write value other than 1. Signed-off-by: Jason Wang Message-Id: <20200610054351.15811-1-jasowang@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella Reviewed-by: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-pci.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index d028c17c24..7bc8c1c056 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1273,16 +1273,20 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, virtio_queue_set_vector(vdev, vdev->queue_sel, val); break; case VIRTIO_PCI_COMMON_Q_ENABLE: - virtio_queue_set_num(vdev, vdev->queue_sel, - proxy->vqs[vdev->queue_sel].num); - virtio_queue_set_rings(vdev, vdev->queue_sel, + if (val == 1) { + virtio_queue_set_num(vdev, vdev->queue_sel, + proxy->vqs[vdev->queue_sel].num); + virtio_queue_set_rings(vdev, vdev->queue_sel, ((uint64_t)proxy->vqs[vdev->queue_sel].desc[1]) << 32 | proxy->vqs[vdev->queue_sel].desc[0], ((uint64_t)proxy->vqs[vdev->queue_sel].avail[1]) << 32 | proxy->vqs[vdev->queue_sel].avail[0], ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 | proxy->vqs[vdev->queue_sel].used[0]); - proxy->vqs[vdev->queue_sel].enabled = 1; + proxy->vqs[vdev->queue_sel].enabled = 1; + } else { + virtio_error(vdev, "wrong value for queue_enable %"PRIx64, val); + } break; case VIRTIO_PCI_COMMON_Q_DESCLO: proxy->vqs[vdev->queue_sel].desc[0] = val;