From 7e26c92ba816fccc40ab0a7c1364b65f226e2b21 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 25 Aug 2020 20:28:51 +1000 Subject: [PATCH 01/33] adb: Correct class size on TYPE_ADB_DEVICE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TypeInfo incorrectly just lets the class size be inherited. It won't actually break things, since the class is abstract, but we should get it right. Signed-off-by: David Gibson Reviewed-by: Cédric Le Goater Reviewed-by: Philippe Mathieu-Daudé --- hw/input/adb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/input/adb.c b/hw/input/adb.c index 013fcc9c54..84331b9fce 100644 --- a/hw/input/adb.c +++ b/hw/input/adb.c @@ -309,6 +309,7 @@ static void adb_device_class_init(ObjectClass *oc, void *data) static const TypeInfo adb_device_type_info = { .name = TYPE_ADB_DEVICE, .parent = TYPE_DEVICE, + .class_size = sizeof(ADBDeviceClass), .instance_size = sizeof(ADBDevice), .abstract = true, .class_init = adb_device_class_init, From 021e878f2efaa4fdcb5e11abfa224f5ba813f809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Sat, 22 Aug 2020 10:39:20 +0200 Subject: [PATCH 02/33] ppc/pnv: Fix TypeInfo of PnvLpcController abstract class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was missing the instance_size field. Cc: Eduardo Habkost Signed-off-by: Cédric Le Goater Message-Id: <20200822083920.2668930-1-clg@kaod.org> Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/ppc/pnv_lpc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c index b5ffa48dac..23f1e09492 100644 --- a/hw/ppc/pnv_lpc.c +++ b/hw/ppc/pnv_lpc.c @@ -646,7 +646,6 @@ static void pnv_lpc_power8_class_init(ObjectClass *klass, void *data) static const TypeInfo pnv_lpc_power8_info = { .name = TYPE_PNV8_LPC, .parent = TYPE_PNV_LPC, - .instance_size = sizeof(PnvLpcController), .class_init = pnv_lpc_power8_class_init, .interfaces = (InterfaceInfo[]) { { TYPE_PNV_XSCOM_INTERFACE }, @@ -687,7 +686,6 @@ static void pnv_lpc_power9_class_init(ObjectClass *klass, void *data) static const TypeInfo pnv_lpc_power9_info = { .name = TYPE_PNV9_LPC, .parent = TYPE_PNV_LPC, - .instance_size = sizeof(PnvLpcController), .class_init = pnv_lpc_power9_class_init, }; @@ -768,6 +766,7 @@ static void pnv_lpc_class_init(ObjectClass *klass, void *data) static const TypeInfo pnv_lpc_info = { .name = TYPE_PNV_LPC, .parent = TYPE_DEVICE, + .instance_size = sizeof(PnvLpcController), .class_init = pnv_lpc_class_init, .class_size = sizeof(PnvLpcClass), .abstract = true, From 98b49b2bea15ed799e6876336c05de030a040586 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 25 Aug 2020 20:51:11 +1000 Subject: [PATCH 03/33] spapr: Remove unnecessary DRC type-checker macros spapr_drc.h includes typechecker macro boilerplate for the many different DRC subclasses. However, most of these types don't actually have different data in their class and/or instance, making these unneeded, unused, and in fact a bad idea. Remove them. Signed-off-by: David Gibson Reviewed-by: Greg Kurz --- include/hw/ppc/spapr_drc.h | 43 +------------------------------------- 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h index 21af8deac1..f270860769 100644 --- a/include/hw/ppc/spapr_drc.h +++ b/include/hw/ppc/spapr_drc.h @@ -29,62 +29,21 @@ TYPE_SPAPR_DR_CONNECTOR) #define TYPE_SPAPR_DRC_PHYSICAL "spapr-drc-physical" -#define SPAPR_DRC_PHYSICAL_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_PHYSICAL) -#define SPAPR_DRC_PHYSICAL_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, \ - TYPE_SPAPR_DRC_PHYSICAL) #define SPAPR_DRC_PHYSICAL(obj) OBJECT_CHECK(SpaprDrcPhysical, (obj), \ TYPE_SPAPR_DRC_PHYSICAL) #define TYPE_SPAPR_DRC_LOGICAL "spapr-drc-logical" -#define SPAPR_DRC_LOGICAL_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_LOGICAL) -#define SPAPR_DRC_LOGICAL_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, \ - TYPE_SPAPR_DRC_LOGICAL) -#define SPAPR_DRC_LOGICAL(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_LOGICAL) #define TYPE_SPAPR_DRC_CPU "spapr-drc-cpu" -#define SPAPR_DRC_CPU_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_CPU) -#define SPAPR_DRC_CPU_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, TYPE_SPAPR_DRC_CPU) -#define SPAPR_DRC_CPU(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_CPU) #define TYPE_SPAPR_DRC_PCI "spapr-drc-pci" -#define SPAPR_DRC_PCI_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_PCI) -#define SPAPR_DRC_PCI_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, TYPE_SPAPR_DRC_PCI) -#define SPAPR_DRC_PCI(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_PCI) #define TYPE_SPAPR_DRC_LMB "spapr-drc-lmb" -#define SPAPR_DRC_LMB_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_LMB) -#define SPAPR_DRC_LMB_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, TYPE_SPAPR_DRC_LMB) -#define SPAPR_DRC_LMB(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_LMB) #define TYPE_SPAPR_DRC_PHB "spapr-drc-phb" -#define SPAPR_DRC_PHB_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_PHB) -#define SPAPR_DRC_PHB_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, TYPE_SPAPR_DRC_PHB) -#define SPAPR_DRC_PHB(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_PHB) #define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem" -#define SPAPR_DRC_PMEM_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_PMEM) -#define SPAPR_DRC_PMEM_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, TYPE_SPAPR_DRC_PMEM) -#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_PMEM) + /* * Various hotplug types managed by SpaprDrc * From 4f311a70893800f33b57852ca4453823271c022b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 19 Aug 2020 15:08:36 +0200 Subject: [PATCH 04/33] spapr/xive: Add a 'hv-prio' property to represent the KVM escalation priority MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On POWER9, the KVM XIVE device uses priority 7 for the escalation interrupts. On POWER10, the host can use a reduced set of priorities and KVM will configure the escalation priority to a lower number. In any case, the guest is allowed to use priorities in a single range : [ 0 .. (maxprio - 1) ]. Introduce a 'hv-prio' property to represent the escalation priority number and use it to compute the "ibm,plat-res-int-priorities" property defining the priority ranges reserved by the hypervisor. Signed-off-by: Cédric Le Goater Message-Id: <20200819130843.2230799-2-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 33 ++++++++++++++------------------- include/hw/ppc/spapr_xive.h | 2 ++ 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index 4bd0d606ba..1fa09f287a 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -595,6 +595,7 @@ static Property spapr_xive_properties[] = { DEFINE_PROP_UINT32("nr-ends", SpaprXive, nr_ends, 0), DEFINE_PROP_UINT64("vc-base", SpaprXive, vc_base, SPAPR_XIVE_VC_BASE), DEFINE_PROP_UINT64("tm-base", SpaprXive, tm_base, SPAPR_XIVE_TM_BASE), + DEFINE_PROP_UINT8("hv-prio", SpaprXive, hv_prio, 7), DEFINE_PROP_END_OF_LIST(), }; @@ -692,12 +693,13 @@ static void spapr_xive_dt(SpaprInterruptController *intc, uint32_t nr_servers, cpu_to_be32(16), /* 64K */ }; /* - * The following array is in sync with the reserved priorities - * defined by the 'spapr_xive_priority_is_reserved' routine. + * QEMU/KVM only needs to define a single range to reserve the + * escalation priority. A priority bitmask would have been more + * appropriate. */ uint32_t plat_res_int_priorities[] = { - cpu_to_be32(7), /* start */ - cpu_to_be32(0xf8), /* count */ + cpu_to_be32(xive->hv_prio), /* start */ + cpu_to_be32(0xff - xive->hv_prio), /* count */ }; /* Thread Interrupt Management Area : User (ring 3) and OS (ring 2) */ @@ -844,19 +846,12 @@ type_init(spapr_xive_register_types) */ /* - * Linux hosts under OPAL reserve priority 7 for their own escalation - * interrupts (DD2.X POWER9). So we only allow the guest to use - * priorities [0..6]. + * On POWER9, the KVM XIVE device uses priority 7 for the escalation + * interrupts. So we only allow the guest to use priorities [0..6]. */ -static bool spapr_xive_priority_is_reserved(uint8_t priority) +static bool spapr_xive_priority_is_reserved(SpaprXive *xive, uint8_t priority) { - switch (priority) { - case 0 ... 6: - return false; - case 7: /* OPAL escalation queue */ - default: - return true; - } + return priority >= xive->hv_prio; } /* @@ -1053,7 +1048,7 @@ static target_ulong h_int_set_source_config(PowerPCCPU *cpu, new_eas.w = eas.w & cpu_to_be64(~EAS_MASKED); } - if (spapr_xive_priority_is_reserved(priority)) { + if (spapr_xive_priority_is_reserved(xive, priority)) { qemu_log_mask(LOG_GUEST_ERROR, "XIVE: priority " TARGET_FMT_ld " is reserved\n", priority); return H_P4; @@ -1212,7 +1207,7 @@ static target_ulong h_int_get_queue_info(PowerPCCPU *cpu, * This is not needed when running the emulation under QEMU */ - if (spapr_xive_priority_is_reserved(priority)) { + if (spapr_xive_priority_is_reserved(xive, priority)) { qemu_log_mask(LOG_GUEST_ERROR, "XIVE: priority " TARGET_FMT_ld " is reserved\n", priority); return H_P3; @@ -1299,7 +1294,7 @@ static target_ulong h_int_set_queue_config(PowerPCCPU *cpu, * This is not needed when running the emulation under QEMU */ - if (spapr_xive_priority_is_reserved(priority)) { + if (spapr_xive_priority_is_reserved(xive, priority)) { qemu_log_mask(LOG_GUEST_ERROR, "XIVE: priority " TARGET_FMT_ld " is reserved\n", priority); return H_P3; @@ -1466,7 +1461,7 @@ static target_ulong h_int_get_queue_config(PowerPCCPU *cpu, * This is not needed when running the emulation under QEMU */ - if (spapr_xive_priority_is_reserved(priority)) { + if (spapr_xive_priority_is_reserved(xive, priority)) { qemu_log_mask(LOG_GUEST_ERROR, "XIVE: priority " TARGET_FMT_ld " is reserved\n", priority); return H_P3; diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h index a1c8540ab4..26c8d90d71 100644 --- a/include/hw/ppc/spapr_xive.h +++ b/include/hw/ppc/spapr_xive.h @@ -49,6 +49,8 @@ typedef struct SpaprXive { void *tm_mmap; MemoryRegion tm_mmio_kvm; VMChangeStateEntry *change; + + uint8_t hv_prio; } SpaprXive; typedef struct SpaprXiveClass { From c02f9892af1f166634e1b4fd722044151acb5e88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 20 Aug 2020 18:46:38 +0200 Subject: [PATCH 05/33] ppc/pnv: Add a HIOMAP erase command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OPAL test suite runs a read-erase-write test on the PNOR : https://github.com/open-power/op-test/blob/master/testcases/OpTestPNOR.py which revealed that the IPMI HIOMAP handlers didn't support HIOMAP_C_ERASE. Implement the sector erase command by writing 0xFF in the PNOR memory region. Cc: Corey Minyard Reported-by: Klaus Heinrich Kiwi Signed-off-by: Cédric Le Goater Message-Id: <20200820164638.2515681-1-clg@kaod.org> Acked-by: Corey Minyard Signed-off-by: David Gibson --- hw/ppc/pnv_bmc.c | 29 ++++++++++++++++++++++++++++- include/hw/ipmi/ipmi.h | 1 + 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/hw/ppc/pnv_bmc.c b/hw/ppc/pnv_bmc.c index 2e1a03daa4..67ebb16c4d 100644 --- a/hw/ppc/pnv_bmc.c +++ b/hw/ppc/pnv_bmc.c @@ -140,6 +140,27 @@ static uint16_t bytes_to_blocks(uint32_t bytes) return bytes >> BLOCK_SHIFT; } +static uint32_t blocks_to_bytes(uint16_t blocks) +{ + return blocks << BLOCK_SHIFT; +} + +static int hiomap_erase(PnvPnor *pnor, uint32_t offset, uint32_t size) +{ + MemTxResult result; + int i; + + for (i = 0; i < size / 4; i++) { + result = memory_region_dispatch_write(&pnor->mmio, offset + i * 4, + 0xFFFFFFFF, MO_32, + MEMTXATTRS_UNSPECIFIED); + if (result != MEMTX_OK) { + return -1; + } + } + return 0; +} + static void hiomap_cmd(IPMIBmcSim *ibs, uint8_t *cmd, unsigned int cmd_len, RspBuffer *rsp) { @@ -155,10 +176,16 @@ static void hiomap_cmd(IPMIBmcSim *ibs, uint8_t *cmd, unsigned int cmd_len, switch (cmd[2]) { case HIOMAP_C_MARK_DIRTY: case HIOMAP_C_FLUSH: - case HIOMAP_C_ERASE: case HIOMAP_C_ACK: break; + case HIOMAP_C_ERASE: + if (hiomap_erase(pnor, blocks_to_bytes(cmd[5] << 8 | cmd[4]), + blocks_to_bytes(cmd[7] << 8 | cmd[6]))) { + rsp_buffer_set_error(rsp, IPMI_CC_UNSPECIFIED); + } + break; + case HIOMAP_C_GET_INFO: rsp_buffer_push(rsp, 2); /* Version 2 */ rsp_buffer_push(rsp, BLOCK_SHIFT); /* block size */ diff --git a/include/hw/ipmi/ipmi.h b/include/hw/ipmi/ipmi.h index 8a99d958bb..c1efdaa4cb 100644 --- a/include/hw/ipmi/ipmi.h +++ b/include/hw/ipmi/ipmi.h @@ -53,6 +53,7 @@ enum ipmi_op { #define IPMI_CC_INVALID_DATA_FIELD 0xcc #define IPMI_CC_BMC_INIT_IN_PROGRESS 0xd2 #define IPMI_CC_COMMAND_NOT_SUPPORTED 0xd5 +#define IPMI_CC_UNSPECIFIED 0xff #define IPMI_NETFN_APP 0x06 #define IPMI_NETFN_OEM 0x3a From 64dbe2c8b823a64ddda5ddef52b7d5a1ddf35d8f Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Thu, 20 Aug 2020 16:06:35 -0300 Subject: [PATCH 06/33] spapr_vscsi: do not allow device hotplug We do not implement hotplug in the vscsi bus, but we forgot to tell qdev about it. The result is that users are able to hotplug devices in the vscsi bus, the devices appear in qdev, but they aren't usable by the guest OS unless the user reboots it first. Setting qbus hotplug_handler to NULL will tell qdev-monitor, via qbus_is_hotpluggable(), that we do not support hotplug operations in spapr_vscsi. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1862059 Signed-off-by: Daniel Henrique Barboza Message-Id: <20200820190635.379657-1-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/scsi/spapr_vscsi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/scsi/spapr_vscsi.c b/hw/scsi/spapr_vscsi.c index d17dc03c73..57f0a1336f 100644 --- a/hw/scsi/spapr_vscsi.c +++ b/hw/scsi/spapr_vscsi.c @@ -1219,6 +1219,9 @@ static void spapr_vscsi_realize(SpaprVioDevice *dev, Error **errp) scsi_bus_new(&s->bus, sizeof(s->bus), DEVICE(dev), &vscsi_scsi_info, NULL); + + /* ibmvscsi SCSI bus does not allow hotplug. */ + qbus_set_hotplug_handler(BUS(&s->bus), NULL); } void spapr_vscsi_create(SpaprVioBus *bus) From b31911c6167432edc7dc5dd37e8c01f426576224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 20 Aug 2020 16:01:06 +0200 Subject: [PATCH 07/33] spapr/xive: Use the xics flag to check for XIVE-only IRQ backends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sPAPR machine has four different IRQ backends, each implementing the XICS or XIVE interrupt mode or both in the case of the 'dual' backend. If a machine is started in P8 compat mode, QEMU should necessarily support the XICS interrupt mode and in that case, the XIVE-only IRQ backend is invalid. Currently, spapr_irq_check() tests the pointer value to the IRQ backend to check for this condition, instead use the 'xics' flag. It's equivalent and it will ease the introduction of new XIVE-only IRQ backends if needed. Signed-off-by: Cédric Le Goater Message-Id: <20200820140106.2357228-1-clg@kaod.org> Signed-off-by: David Gibson --- hw/ppc/spapr_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c index 72bb938375..f59960339e 100644 --- a/hw/ppc/spapr_irq.c +++ b/hw/ppc/spapr_irq.c @@ -172,7 +172,7 @@ static int spapr_irq_check(SpaprMachineState *spapr, Error **errp) * To cover both and not confuse the OS, add an early failure in * QEMU. */ - if (spapr->irq == &spapr_irq_xive) { + if (!spapr->irq->xics) { error_setg(errp, "XIVE-only machines require a POWER9 CPU"); return -1; } From 235d3b116213828f4206e2e4b199a32bffc96f35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 20 Aug 2020 15:45:44 +0200 Subject: [PATCH 08/33] spapr/xive: Modify kvm_cpu_is_enabled() interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We will use to check if a vCPU IPI has been created. Signed-off-by: Cédric Le Goater Message-Id: <20200820134547.2355743-2-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive_kvm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index e8667ce5f6..0e834b4b71 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -36,10 +36,9 @@ typedef struct KVMEnabledCPU { static QLIST_HEAD(, KVMEnabledCPU) kvm_enabled_cpus = QLIST_HEAD_INITIALIZER(&kvm_enabled_cpus); -static bool kvm_cpu_is_enabled(CPUState *cs) +static bool kvm_cpu_is_enabled(unsigned long vcpu_id) { KVMEnabledCPU *enabled_cpu; - unsigned long vcpu_id = kvm_arch_vcpu_id(cs); QLIST_FOREACH(enabled_cpu, &kvm_enabled_cpus, node) { if (enabled_cpu->vcpu_id == vcpu_id) { @@ -157,7 +156,7 @@ int kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) assert(xive->fd != -1); /* Check if CPU was hot unplugged and replugged. */ - if (kvm_cpu_is_enabled(tctx->cs)) { + if (kvm_cpu_is_enabled(kvm_arch_vcpu_id(tctx->cs))) { return 0; } From fa94447a2cd6643609d5822d5b5f739dc8ad8a8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 20 Aug 2020 15:45:45 +0200 Subject: [PATCH 09/33] spapr/xive: Use kvmppc_xive_source_reset() in post_load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is doing an extra loop but should be equivalent. It also differentiate the reset of the sources from the restore of the sources configuration. This will help in allocating the vCPU IPIs independently. Signed-off-by: Cédric Le Goater Message-Id: <20200820134547.2355743-3-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive_kvm.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 0e834b4b71..3e80ea0ce9 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -646,22 +646,22 @@ int kvmppc_xive_post_load(SpaprXive *xive, int version_id) } } + /* + * We can only restore the source config if the source has been + * previously set in KVM. Since we don't do that at reset time + * when restoring a VM, let's do it now. + */ + ret = kvmppc_xive_source_reset(&xive->source, &local_err); + if (ret < 0) { + goto fail; + } + /* Restore the EAT */ for (i = 0; i < xive->nr_irqs; i++) { if (!xive_eas_is_valid(&xive->eat[i])) { continue; } - /* - * We can only restore the source config if the source has been - * previously set in KVM. Since we don't do that for all interrupts - * at reset time anymore, let's do it now. - */ - ret = kvmppc_xive_source_reset_one(&xive->source, i, &local_err); - if (ret < 0) { - goto fail; - } - ret = kvmppc_xive_set_source_config(xive, i, &xive->eat[i], &local_err); if (ret < 0) { goto fail; From acbdb9956fe93f4669141f103cb543d3025775db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 20 Aug 2020 15:45:46 +0200 Subject: [PATCH 10/33] spapr/xive: Allocate IPIs independently from the other sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vCPU IPIs are now allocated in kvmppc_xive_cpu_connect() when the vCPU connects to the KVM device and not when all the sources are reset in kvmppc_xive_source_reset() This requires extra care for hotplug vCPUs and VM restore. Signed-off-by: Cédric Le Goater Message-Id: <20200820134547.2355743-4-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive_kvm.c | 47 +++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 3e80ea0ce9..507637d51e 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -146,6 +146,15 @@ int kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp) return s.ret; } +static int kvmppc_xive_reset_ipi(SpaprXive *xive, CPUState *cs, Error **errp) +{ + unsigned long ipi = kvm_arch_vcpu_id(cs); + uint64_t state = 0; + + return kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SOURCE, ipi, + &state, true, errp); +} + int kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) { ERRP_GUARD(); @@ -175,6 +184,12 @@ int kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) return ret; } + /* Create/reset the vCPU IPI */ + ret = kvmppc_xive_reset_ipi(xive, tctx->cs, errp); + if (ret < 0) { + return ret; + } + kvm_cpu_enable(tctx->cs); return 0; } @@ -234,6 +249,12 @@ int kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp) assert(xive->fd != -1); + /* + * The vCPU IPIs are now allocated in kvmppc_xive_cpu_connect() + * and not with all sources in kvmppc_xive_source_reset() + */ + assert(srcno >= SPAPR_XIRQ_BASE); + if (xive_source_irq_is_lsi(xsrc, srcno)) { state |= KVM_XIVE_LEVEL_SENSITIVE; if (xsrc->status[srcno] & XIVE_STATUS_ASSERTED) { @@ -245,12 +266,28 @@ int kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp) true, errp); } +/* + * To be valid, a source must have been claimed by the machine (valid + * entry in the EAS table) and if it is a vCPU IPI, the vCPU should + * have been enabled, which means the IPI has been allocated in + * kvmppc_xive_cpu_connect(). + */ +static bool xive_source_is_valid(SpaprXive *xive, int i) +{ + return xive_eas_is_valid(&xive->eat[i]) && + (i >= SPAPR_XIRQ_BASE || kvm_cpu_is_enabled(i)); +} + static int kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp) { SpaprXive *xive = SPAPR_XIVE(xsrc->xive); int i; - for (i = 0; i < xsrc->nr_irqs; i++) { + /* + * Skip the vCPU IPIs. These are created/reset when the vCPUs are + * connected in kvmppc_xive_cpu_connect() + */ + for (i = SPAPR_XIRQ_BASE; i < xsrc->nr_irqs; i++) { int ret; if (!xive_eas_is_valid(&xive->eat[i])) { @@ -332,7 +369,7 @@ static void kvmppc_xive_source_get_state(XiveSource *xsrc) for (i = 0; i < xsrc->nr_irqs; i++) { uint8_t pq; - if (!xive_eas_is_valid(&xive->eat[i])) { + if (!xive_source_is_valid(xive, i)) { continue; } @@ -515,7 +552,7 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running, uint8_t pq; uint8_t old_pq; - if (!xive_eas_is_valid(&xive->eat[i])) { + if (!xive_source_is_valid(xive, i)) { continue; } @@ -543,7 +580,7 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running, for (i = 0; i < xsrc->nr_irqs; i++) { uint8_t pq; - if (!xive_eas_is_valid(&xive->eat[i])) { + if (!xive_source_is_valid(xive, i)) { continue; } @@ -658,7 +695,7 @@ int kvmppc_xive_post_load(SpaprXive *xive, int version_id) /* Restore the EAT */ for (i = 0; i < xive->nr_irqs; i++) { - if (!xive_eas_is_valid(&xive->eat[i])) { + if (!xive_source_is_valid(xive, i)) { continue; } From eab0a2d06e97ebcf74e8437aa985eee5df3a4c68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 20 Aug 2020 15:45:47 +0200 Subject: [PATCH 11/33] spapr/xive: Allocate vCPU IPIs from the vCPU contexts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When QEMU switches to the XIVE interrupt mode, it creates all the guest interrupts at the level of the KVM device. These interrupts are backed by real HW interrupts from the IPI interrupt pool of the XIVE controller. Currently, this is done from the QEMU main thread, which results in allocating all interrupts from the chip on which QEMU is running. IPIs are not distributed across the system and the load is not well balanced across the interrupt controllers. Change the vCPU IPI allocation to run from the vCPU context. The associated XIVE IPI interrupt will be allocated on the chip on which the vCPU is running and improve distribution of the IPIs in the system. When the vCPUs are pinned, this will make the IPI local to the chip of the vCPU. It will reduce rerouting between interrupt controllers and gives better performance. Device interrupts are still treated the same. To improve placement, we would need some information on the chip owning the virtual source or the HW source in case of a passthrough device but this reuires changes in PAPR. Signed-off-by: Cédric Le Goater Message-Id: <20200820134547.2355743-5-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive_kvm.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 507637d51e..66bf4c06fe 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -146,13 +146,43 @@ int kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp) return s.ret; } -static int kvmppc_xive_reset_ipi(SpaprXive *xive, CPUState *cs, Error **errp) +/* + * Allocate the vCPU IPIs from the vCPU context. This will allocate + * the XIVE IPI interrupt on the chip on which the vCPU is running. + * This gives a better distribution of IPIs when the guest has a lot + * of vCPUs. When the vCPUs are pinned, this will make the IPI local + * to the chip of the vCPU. It will reduce rerouting between interrupt + * controllers and gives better performance. + */ +typedef struct { + SpaprXive *xive; + Error *err; + int rc; +} XiveInitIPI; + +static void kvmppc_xive_reset_ipi_on_cpu(CPUState *cs, run_on_cpu_data arg) { unsigned long ipi = kvm_arch_vcpu_id(cs); + XiveInitIPI *s = arg.host_ptr; uint64_t state = 0; - return kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SOURCE, ipi, - &state, true, errp); + s->rc = kvm_device_access(s->xive->fd, KVM_DEV_XIVE_GRP_SOURCE, ipi, + &state, true, &s->err); +} + +static int kvmppc_xive_reset_ipi(SpaprXive *xive, CPUState *cs, Error **errp) +{ + XiveInitIPI s = { + .xive = xive, + .err = NULL, + .rc = 0, + }; + + run_on_cpu(cs, kvmppc_xive_reset_ipi_on_cpu, RUN_ON_CPU_HOST_PTR(&s)); + if (s.err) { + error_propagate(errp, s.err); + } + return s.rc; } int kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) From 90d282d0858cf5a38f3e8a7e201aeab2a0ccbe88 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Tue, 25 Aug 2020 18:57:47 -0300 Subject: [PATCH 12/33] ppc/spapr_nvdimm: use g_autofree in spapr_nvdimm_validate_opts() Since we're using the string just once, just use g_autofree and avoid leaking it without calling g_free(). Signed-off-by: Daniel Henrique Barboza Message-Id: <20200825215749.213536-2-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_nvdimm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 81410aa63f..9a20a65640 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -33,7 +33,7 @@ void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, uint64_t size, Error **errp) { - char *uuidstr = NULL; + g_autofree char *uuidstr = NULL; QemuUUID uuid; int ret; @@ -54,7 +54,6 @@ void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, uint64_t size, &error_abort); ret = qemu_uuid_parse(uuidstr, &uuid); g_assert(!ret); - g_free(uuidstr); if (qemu_uuid_is_null(&uuid)) { error_setg(errp, "NVDIMM device requires the uuid to be set"); From beb6073fe7f53526bc49f9cb1a3a830c2cdaf3e7 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Tue, 25 Aug 2020 18:57:48 -0300 Subject: [PATCH 13/33] spapr, spapr_nvdimm: fold NVDIMM validation in the same place NVDIMM has different contraints and conditions than the regular DIMM and we'll need to add at least one more. Instead of relying on 'if (nvdimm)' conditionals in the body of spapr_memory_pre_plug(), use the existing spapr_nvdimm_validate_opts() and put all NVDIMM handling code there. Rename it to spapr_nvdimm_validate() to reflect that the function is now checking more than the nvdimm device options. This makes spapr_memory_pre_plug() a bit easier to follow, and we can tune in NVDIMM parameters and validation in the same place. Signed-off-by: Daniel Henrique Barboza Message-Id: <20200825215749.213536-3-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 18 ++++++------------ hw/ppc/spapr_nvdimm.c | 10 ++++++++-- include/hw/ppc/spapr_nvdimm.h | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index dd2fa4826b..b0a04443fb 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3520,7 +3520,6 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, { const SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev); SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev); - const MachineClass *mc = MACHINE_CLASS(smc); bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM); PCDIMMDevice *dimm = PC_DIMM(dev); Error *local_err = NULL; @@ -3533,27 +3532,22 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, return; } - if (is_nvdimm && !mc->nvdimm_supported) { - error_setg(errp, "NVDIMM hotplug not supported for this machine"); - return; - } - size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err); if (local_err) { error_propagate(errp, local_err); return; } - if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) { - error_setg(errp, "Hotplugged memory size must be a multiple of " - "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB); - return; - } else if (is_nvdimm) { - spapr_nvdimm_validate_opts(NVDIMM(dev), size, &local_err); + if (is_nvdimm) { + spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, &local_err); if (local_err) { error_propagate(errp, local_err); return; } + } else if (size % SPAPR_MEMORY_BLOCK_SIZE) { + error_setg(errp, "Hotplugged memory size must be a multiple of " + "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB); + return; } memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP, diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 9a20a65640..bc2b65420c 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -30,13 +30,19 @@ #include "hw/ppc/fdt.h" #include "qemu/range.h" -void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, uint64_t size, - Error **errp) +void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, + uint64_t size, Error **errp) { + const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); g_autofree char *uuidstr = NULL; QemuUUID uuid; int ret; + if (!mc->nvdimm_supported) { + error_setg(errp, "NVDIMM hotplug not supported for this machine"); + return; + } + if (object_property_get_int(OBJECT(nvdimm), NVDIMM_LABEL_SIZE_PROP, &error_abort) == 0) { error_setg(errp, "PAPR requires NVDIMM devices to have label-size set"); diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h index b3330cc485..fd1736634c 100644 --- a/include/hw/ppc/spapr_nvdimm.h +++ b/include/hw/ppc/spapr_nvdimm.h @@ -29,8 +29,8 @@ int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, void *fdt, int *fdt_start_offset, Error **errp); int spapr_dt_nvdimm(void *fdt, int parent_offset, NVDIMMDevice *nvdimm); void spapr_dt_persistent_memory(void *fdt); -void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, uint64_t size, - Error **errp); +void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, + uint64_t size, Error **errp); void spapr_add_nvdimm(DeviceState *dev, uint64_t slot, Error **errp); void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr); From 28f5a716212b891eef3eadc2c5053c0e0c512e1d Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Tue, 25 Aug 2020 18:57:49 -0300 Subject: [PATCH 14/33] ppc/spapr_nvdimm: do not enable support with 'nvdimm=off' The NVDIMM support for pSeries was introduced in 5.1, but it didn't contemplate the 'nvdimm' machine option that other archs uses. For every other arch, if no '-machine nvdimm(=on)' is present, it is assumed that the NVDIMM support is disabled. The user must explictly inform that the machine supports NVDIMM. For pseries-5.1 the 'nvdimm' option is completely ignored, and support is always assumed to exist. This leads to situations where the user is able to set 'nvdimm=off' but the guest boots up with the NVDIMMs anyway. Fixing this now, after 5.1 launch, can put the overall NVDIMM support for pseries in a strange place regarding this 'nvdimm' machine option. If we force everything to be like other archs, existing pseries-5.1 guests that didn't use 'nvdimm' to use NVDIMM devices will break. If we attempt to make the newer pseries machines (5.2+) behave like everyone else, but keep pseries-5.1 untouched, we'll have consistency problems on machine upgrade (5.1 will have different default values for NVDIMM support than 5.2). The common ground here is, if the user sets 'nvdimm=off', we must comply regardless of being 5.1 or 5.2+. This patch changes spapr_nvdimm_validate() to verify if the user set NVDIMM support off in the machine options and, in that case, error out if we have a NVDIMM device. The default value for 5.2+ pseries machines will still be 'nvdimm=on' when there is no 'nvdimm' option declared, just like it is today with pseries-5.1. In the end we'll have different default semantics from everyone else in the absence of the 'nvdimm' machine option, but this boat has sailed. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1848887 Signed-off-by: Daniel Henrique Barboza Message-Id: <20200825215749.213536-4-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_nvdimm.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index bc2b65420c..95cbc30528 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -27,13 +27,17 @@ #include "hw/ppc/spapr_nvdimm.h" #include "hw/mem/nvdimm.h" #include "qemu/nvdimm-utils.h" +#include "qemu/option.h" #include "hw/ppc/fdt.h" #include "qemu/range.h" +#include "sysemu/sysemu.h" void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp) { const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); + const MachineState *ms = MACHINE(hotplug_dev); + const char *nvdimm_opt = qemu_opt_get(qemu_get_machine_opts(), "nvdimm"); g_autofree char *uuidstr = NULL; QemuUUID uuid; int ret; @@ -43,6 +47,20 @@ void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, return; } + /* + * NVDIMM support went live in 5.1 without considering that, in + * other archs, the user needs to enable NVDIMM support with the + * 'nvdimm' machine option and the default behavior is NVDIMM + * support disabled. It is too late to roll back to the standard + * behavior without breaking 5.1 guests. What we can do is to + * ensure that, if the user sets nvdimm=off, we error out + * regardless of being 5.1 or newer. + */ + if (!ms->nvdimms_state->is_enabled && nvdimm_opt) { + error_setg(errp, "nvdimm device found but 'nvdimm=off' was set"); + return; + } + if (object_property_get_int(OBJECT(nvdimm), NVDIMM_LABEL_SIZE_PROP, &error_abort) == 0) { error_setg(errp, "PAPR requires NVDIMM devices to have label-size set"); From c1b701587e59d9569c38d1d6033cd7cc2a992105 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 26 Aug 2020 02:55:28 -0300 Subject: [PATCH 15/33] target/arm: Move start-powered-off property to generic CPUState MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are other platforms which also have CPUs that start powered off, so generalize the start-powered-off property so that it can be used by them. Note that ARMv7MState also has a property of the same name but this patch doesn't change it because that class isn't a subclass of CPUState so it wouldn't be a trivial change. This change should not cause any change in behavior. Suggested-by: Eduardo Habkost Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: David Gibson Reviewed-by: Greg Kurz Signed-off-by: Thiago Jung Bauermann Message-Id: <20200826055535.951207-2-bauerman@linux.ibm.com> Signed-off-by: David Gibson --- exec.c | 1 + include/hw/core/cpu.h | 4 ++++ target/arm/cpu.c | 5 ++--- target/arm/cpu.h | 3 --- target/arm/kvm32.c | 2 +- target/arm/kvm64.c | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/exec.c b/exec.c index 7683afb6a8..e34b602bdf 100644 --- a/exec.c +++ b/exec.c @@ -899,6 +899,7 @@ Property cpu_common_props[] = { DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION, MemoryRegion *), #endif + DEFINE_PROP_BOOL("start-powered-off", CPUState, start_powered_off, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 8f145733ce..9fc2696db5 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -374,6 +374,10 @@ struct CPUState { bool created; bool stop; bool stopped; + + /* Should CPU start in powered-off state? */ + bool start_powered_off; + bool unplug; bool crash_occurred; bool exit_request; diff --git a/target/arm/cpu.c b/target/arm/cpu.c index c179e0752d..9f814194fb 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -174,8 +174,8 @@ static void arm_cpu_reset(DeviceState *dev) env->vfp.xregs[ARM_VFP_MVFR1] = cpu->isar.mvfr1; env->vfp.xregs[ARM_VFP_MVFR2] = cpu->isar.mvfr2; - cpu->power_state = cpu->start_powered_off ? PSCI_OFF : PSCI_ON; - s->halted = cpu->start_powered_off; + cpu->power_state = s->start_powered_off ? PSCI_OFF : PSCI_ON; + s->halted = s->start_powered_off; if (arm_feature(env, ARM_FEATURE_IWMMXT)) { env->iwmmxt.cregs[ARM_IWMMXT_wCID] = 0x69051000 | 'Q'; @@ -2186,7 +2186,6 @@ static const ARMCPUInfo arm_cpus[] = { }; static Property arm_cpu_properties[] = { - DEFINE_PROP_BOOL("start-powered-off", ARMCPU, start_powered_off, false), DEFINE_PROP_UINT32("psci-conduit", ARMCPU, psci_conduit, 0), DEFINE_PROP_UINT64("midr", ARMCPU, midr, 0), DEFINE_PROP_UINT64("mp-affinity", ARMCPU, diff --git a/target/arm/cpu.h b/target/arm/cpu.h index a1c7d8ebae..6036f61d60 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -817,9 +817,6 @@ struct ARMCPU { */ uint32_t psci_version; - /* Should CPU start in PSCI powered-off state? */ - bool start_powered_off; - /* Current power state, access guarded by BQL */ ARMPSCIState power_state; diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c index 0af46b41c8..1f2b8f8b7a 100644 --- a/target/arm/kvm32.c +++ b/target/arm/kvm32.c @@ -218,7 +218,7 @@ int kvm_arch_init_vcpu(CPUState *cs) /* Determine init features for this CPU */ memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); - if (cpu->start_powered_off) { + if (cs->start_powered_off) { cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_POWER_OFF; } if (kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PSCI_0_2)) { diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index ef1e960285..987b35e33f 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -774,7 +774,7 @@ int kvm_arch_init_vcpu(CPUState *cs) /* Determine init features for this CPU */ memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); - if (cpu->start_powered_off) { + if (cs->start_powered_off) { cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_POWER_OFF; } if (kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PSCI_0_2)) { From 6ad1da667c8e21f019d4adc21702e06dd9225790 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 26 Aug 2020 02:55:29 -0300 Subject: [PATCH 16/33] target/arm: Move setting of CPU halted state to generic code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change is in a separate patch because it's not so obvious that it won't cause a regression. Suggested-by: Eduardo Habkost Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: David Gibson Reviewed-by: Greg Kurz Signed-off-by: Thiago Jung Bauermann Message-Id: <20200826055535.951207-3-bauerman@linux.ibm.com> Signed-off-by: David Gibson --- hw/core/cpu.c | 2 +- target/arm/cpu.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/core/cpu.c b/hw/core/cpu.c index 22bc3f974a..8f65383ffb 100644 --- a/hw/core/cpu.c +++ b/hw/core/cpu.c @@ -258,7 +258,7 @@ static void cpu_common_reset(DeviceState *dev) } cpu->interrupt_request = 0; - cpu->halted = 0; + cpu->halted = cpu->start_powered_off; cpu->mem_io_pc = 0; cpu->icount_extra = 0; atomic_set(&cpu->icount_decr_ptr->u32, 0); diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 9f814194fb..6b4e708c08 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -175,7 +175,6 @@ static void arm_cpu_reset(DeviceState *dev) env->vfp.xregs[ARM_VFP_MVFR2] = cpu->isar.mvfr2; cpu->power_state = s->start_powered_off ? PSCI_OFF : PSCI_ON; - s->halted = s->start_powered_off; if (arm_feature(env, ARM_FEATURE_IWMMXT)) { env->iwmmxt.cregs[ARM_IWMMXT_wCID] = 0x69051000 | 'Q'; From 554c2169e9251ca2829ab968bd9ba5641a5abe1d Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 26 Aug 2020 02:55:30 -0300 Subject: [PATCH 17/33] ppc/spapr: Use start-powered-off CPUState property PowerPC sPAPR CPUs start in the halted state, and spapr_reset_vcpu() attempts to implement this by setting CPUState::halted to 1. But that's too late for the case of hotplugged CPUs in a machine configure with 2 or more threads per core. By then, other parts of QEMU have already caused the vCPU to run in an unitialized state a couple of times. For example, ppc_cpu_reset() calls ppc_tlb_invalidate_all(), which ends up calling async_run_on_cpu(). This kicks the new vCPU while it has CPUState::halted = 0, causing QEMU to issue a KVM_RUN ioctl on the new vCPU before the guest is able to make the start-cpu RTAS call to initialize its register state. This problem doesn't seem to cause visible issues for regular guests, but on a secure guest running under the Ultravisor it does. The Ultravisor relies on being able to snoop on the start-cpu RTAS call to map vCPUs to guests, and this issue causes it to see a stray vCPU that doesn't belong to any guest. Fix by setting the start-powered-off CPUState property in spapr_create_vcpu(), which makes cpu_common_reset() initialize CPUState::halted to 1 at an earlier moment. Suggested-by: Eduardo Habkost Acked-by: David Gibson Reviewed-by: Greg Kurz Signed-off-by: Thiago Jung Bauermann Message-Id: <20200826055535.951207-4-bauerman@linux.ibm.com> Signed-off-by: David Gibson --- hw/ppc/spapr_cpu_core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index c4f47dcc04..2125fdac34 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -36,11 +36,6 @@ static void spapr_reset_vcpu(PowerPCCPU *cpu) cpu_reset(cs); - /* All CPUs start halted. CPU0 is unhalted from the machine level - * reset code and the rest are explicitly started up by the guest - * using an RTAS call */ - cs->halted = 1; - env->spr[SPR_HIOR] = 0; lpcr = env->spr[SPR_LPCR]; @@ -274,6 +269,11 @@ static PowerPCCPU *spapr_create_vcpu(SpaprCpuCore *sc, int i, Error **errp) cs = CPU(obj); cpu = POWERPC_CPU(obj); + /* + * All CPUs start halted. CPU0 is unhalted from the machine level reset code + * and the rest are explicitly started up by the guest using an RTAS call. + */ + cs->start_powered_off = true; cs->cpu_index = cc->core_id + i; spapr_set_vcpu_id(cpu, cs->cpu_index, &local_err); if (local_err) { From a2c93f06cf63c29448f60285b7e103bc07cf014b Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 26 Aug 2020 02:55:31 -0300 Subject: [PATCH 18/33] ppc/e500: Use start-powered-off CPUState property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of setting CPUState::halted to 1 in ppce500_cpu_reset_sec(), use the start-powered-off property which makes cpu_common_reset() initialize it to 1 in common code. Also change creation of CPU object from cpu_create() to object_new() and qdev_realize_and_unref() because cpu_create() realizes the CPU and it's not possible to set a property after the object is realized. Signed-off-by: Thiago Jung Bauermann Message-Id: <20200826055535.951207-5-bauerman@linux.ibm.com> Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/ppc/e500.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index ab9884e315..ae39b9358e 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -704,9 +704,6 @@ static void ppce500_cpu_reset_sec(void *opaque) cpu_reset(cs); - /* Secondary CPU starts in halted state for now. Needs to change when - implementing non-kernel boot. */ - cs->halted = 1; cs->exception_index = EXCP_HLT; } @@ -865,7 +862,7 @@ void ppce500_init(MachineState *machine) CPUState *cs; qemu_irq *input; - cpu = POWERPC_CPU(cpu_create(machine->cpu_type)); + cpu = POWERPC_CPU(object_new(machine->cpu_type)); env = &cpu->env; cs = CPU(cpu); @@ -875,6 +872,14 @@ void ppce500_init(MachineState *machine) exit(1); } + /* + * Secondary CPU starts in halted state for now. Needs to change + * when implementing non-kernel boot. + */ + object_property_set_bool(OBJECT(cs), "start-powered-off", i != 0, + &error_fatal); + qdev_realize_and_unref(DEVICE(cs), NULL, &error_fatal); + if (!firstenv) { firstenv = env; } From 102ca9667d1e813c305ff84e6d07408258717ba9 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 26 Aug 2020 02:55:32 -0300 Subject: [PATCH 19/33] mips/cps: Use start-powered-off CPUState property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of setting CPUState::halted to 1 in main_cpu_reset(), use the start-powered-off property which makes cpu_common_reset() initialize it to 1 in common code. Also change creation of CPU object from cpu_create() to object_new() and qdev_realize_and_unref() because cpu_create() realizes the CPU and it's not possible to set a property after the object is realized. Signed-off-by: Thiago Jung Bauermann Message-Id: <20200826055535.951207-6-bauerman@linux.ibm.com> Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/mips/cps.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/hw/mips/cps.c b/hw/mips/cps.c index 615e1a1ad2..23c0f87e41 100644 --- a/hw/mips/cps.c +++ b/hw/mips/cps.c @@ -52,9 +52,6 @@ static void main_cpu_reset(void *opaque) CPUState *cs = CPU(cpu); cpu_reset(cs); - - /* All VPs are halted on reset. Leave powering up to CPC. */ - cs->halted = 1; } static bool cpu_mips_itu_supported(CPUMIPSState *env) @@ -76,7 +73,17 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) bool saar_present = false; for (i = 0; i < s->num_vp; i++) { - cpu = MIPS_CPU(cpu_create(s->cpu_type)); + cpu = MIPS_CPU(object_new(s->cpu_type)); + + /* All VPs are halted on reset. Leave powering up to CPC. */ + if (!object_property_set_bool(OBJECT(cpu), "start-powered-off", true, + errp)) { + return; + } + + if (!qdev_realize_and_unref(DEVICE(cpu), NULL, errp)) { + return; + } /* Init internal devices */ cpu_mips_irq_init_cpu(cpu); From 3d0c102092bc6096c278784484605096aa4f2384 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 26 Aug 2020 02:55:33 -0300 Subject: [PATCH 20/33] sparc/sun4m: Don't set cs->halted = 0 in main_cpu_reset() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We rely on cpu_common_reset() to set cs->halted to 0, it's redundant to do it in main_cpu_reset(). Signed-off-by: Thiago Jung Bauermann Message-Id: <20200826055535.951207-7-bauerman@linux.ibm.com> Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/sparc/sun4m.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/sparc/sun4m.c b/hw/sparc/sun4m.c index cf7dfa4af5..7484aa4438 100644 --- a/hw/sparc/sun4m.c +++ b/hw/sparc/sun4m.c @@ -224,7 +224,6 @@ static void main_cpu_reset(void *opaque) CPUState *cs = CPU(cpu); cpu_reset(cs); - cs->halted = 0; } static void secondary_cpu_reset(void *opaque) From 24f675cd3b1406f3af9f9b811bb6946d0e2e6ecf Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 26 Aug 2020 02:55:34 -0300 Subject: [PATCH 21/33] sparc/sun4m: Use start-powered-off CPUState property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of setting CPUState::halted to 1 in secondary_cpu_reset(), use the start-powered-off property which makes cpu_common_reset() initialize it to 1 in common code. Now secondary_cpu_reset() becomes equivalent to main_cpu_reset() so rename the function to sun4m_cpu_reset(). Also remove setting of cs->halted from cpu_devinit(), which seems out of place when compared to similar code in other architectures (e.g., ppce500_init() in hw/ppc/e500.c). Finally, change creation of CPU object from cpu_create() to object_new() and qdev_realize_and_unref() because cpu_create() realizes the CPU and it's not possible to set a property after the object is realized. Suggested-by: Philippe Mathieu-Daudé Signed-off-by: Thiago Jung Bauermann Message-Id: <20200826055535.951207-8-bauerman@linux.ibm.com> Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/sparc/sun4m.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/hw/sparc/sun4m.c b/hw/sparc/sun4m.c index 7484aa4438..6bf9d27d8a 100644 --- a/hw/sparc/sun4m.c +++ b/hw/sparc/sun4m.c @@ -218,7 +218,7 @@ static void dummy_cpu_set_irq(void *opaque, int irq, int level) { } -static void main_cpu_reset(void *opaque) +static void sun4m_cpu_reset(void *opaque) { SPARCCPU *cpu = opaque; CPUState *cs = CPU(cpu); @@ -226,15 +226,6 @@ static void main_cpu_reset(void *opaque) cpu_reset(cs); } -static void secondary_cpu_reset(void *opaque) -{ - SPARCCPU *cpu = opaque; - CPUState *cs = CPU(cpu); - - cpu_reset(cs); - cs->halted = 1; -} - static void cpu_halt_signal(void *opaque, int irq, int level) { if (level && current_cpu) { @@ -818,21 +809,17 @@ static const TypeInfo ram_info = { static void cpu_devinit(const char *cpu_type, unsigned int id, uint64_t prom_addr, qemu_irq **cpu_irqs) { - CPUState *cs; SPARCCPU *cpu; CPUSPARCState *env; - cpu = SPARC_CPU(cpu_create(cpu_type)); + cpu = SPARC_CPU(object_new(cpu_type)); env = &cpu->env; cpu_sparc_set_id(env, id); - if (id == 0) { - qemu_register_reset(main_cpu_reset, cpu); - } else { - qemu_register_reset(secondary_cpu_reset, cpu); - cs = CPU(cpu); - cs->halted = 1; - } + qemu_register_reset(sun4m_cpu_reset, cpu); + object_property_set_bool(OBJECT(cpu), "start-powered-off", id != 0, + &error_fatal); + qdev_realize_and_unref(DEVICE(cpu), NULL, &error_fatal); *cpu_irqs = qemu_allocate_irqs(cpu_set_irq, cpu, MAX_PILS); env->prom_addr = prom_addr; } From 86c5e6aba6c08a81986fc8c82bfce477288ede48 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 26 Aug 2020 02:55:35 -0300 Subject: [PATCH 22/33] target/s390x: Use start-powered-off CPUState property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of setting CPUState::halted to 1 in s390_cpu_initfn(), use the start-powered-off property which makes cpu_common_reset() initialize it to 1 in common code. Note that this changes behavior by setting cs->halted to 1 on reset, which didn't happen before. Acked-by: Cornelia Huck Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thiago Jung Bauermann Message-Id: <20200826055535.951207-9-bauerman@linux.ibm.com> [dwg: Fix from Laurent Vivier for user only case] Signed-off-by: David Gibson --- target/s390x/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c index 08eb674d22..749cd548f0 100644 --- a/target/s390x/cpu.c +++ b/target/s390x/cpu.c @@ -291,9 +291,9 @@ static void s390_cpu_initfn(Object *obj) S390CPU *cpu = S390_CPU(obj); cpu_set_cpustate_pointers(cpu); - cs->halted = 1; cs->exception_index = EXCP_HLT; #if !defined(CONFIG_USER_ONLY) + cs->start_powered_off = true; object_property_add(obj, "crash-information", "GuestPanicInformation", s390_cpu_get_crash_info_qom, NULL, NULL, NULL); cpu->env.tod_timer = From 4192920cbc962770d54cd932748f913a7b1fe3c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 1 Sep 2020 12:40:41 +0200 Subject: [PATCH 23/33] hw/ppc/ppc4xx_pci: Use ARRAY_SIZE() instead of magic value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the magic '4' by ARRAY_SIZE(s->irq) which is more explicit. Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200901104043.91383-4-f4bug@amsat.org> Reviewed-by: Richard Henderson Reviewed-by: Cédric Le Goater Signed-off-by: David Gibson --- hw/ppc/ppc4xx_pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/ppc/ppc4xx_pci.c b/hw/ppc/ppc4xx_pci.c index 3ea47df71f..cd3f192a13 100644 --- a/hw/ppc/ppc4xx_pci.c +++ b/hw/ppc/ppc4xx_pci.c @@ -320,7 +320,8 @@ static void ppc4xx_pcihost_realize(DeviceState *dev, Error **errp) b = pci_register_root_bus(dev, NULL, ppc4xx_pci_set_irq, ppc4xx_pci_map_irq, s->irq, get_system_memory(), - get_system_io(), 0, 4, TYPE_PCI_BUS); + get_system_io(), 0, ARRAY_SIZE(s->irq), + TYPE_PCI_BUS); h->bus = b; pci_create_simple(b, 0, "ppc4xx-host-bridge"); From f5f239315cbf047aef51022ecb083fc7662c660a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 1 Sep 2020 12:40:42 +0200 Subject: [PATCH 24/33] hw/ppc/ppc4xx_pci: Replace pointless warning by assert() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We call pci_register_root_bus() to register 4 IRQs with the ppc4xx_pci_set_irq() handler. As it can only be called with values in the [0-4[ range, replace the pointless warning by an assert(). Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200901104043.91383-5-f4bug@amsat.org> Reviewed-by: Richard Henderson Signed-off-by: David Gibson --- hw/ppc/ppc4xx_pci.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hw/ppc/ppc4xx_pci.c b/hw/ppc/ppc4xx_pci.c index cd3f192a13..503ef46b39 100644 --- a/hw/ppc/ppc4xx_pci.c +++ b/hw/ppc/ppc4xx_pci.c @@ -256,10 +256,7 @@ static void ppc4xx_pci_set_irq(void *opaque, int irq_num, int level) qemu_irq *pci_irqs = opaque; trace_ppc4xx_pci_set_irq(irq_num); - if (irq_num < 0) { - fprintf(stderr, "%s: PCI irq %d\n", __func__, irq_num); - return; - } + assert(irq_num >= 0); qemu_set_irq(pci_irqs[irq_num], level); } From 1eee9950264867232a8771e60f3955b84acf538f Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Tue, 1 Sep 2020 09:56:39 -0300 Subject: [PATCH 25/33] ppc: introducing spapr_numa.c NUMA code helper We're going to make changes in how spapr handles all ibm,associativity* related properties to enhance our current NUMA support. At this moment we have associativity code scattered all around spapr_* files, with hardcoded values and array sizes. This makes it harder to change any NUMA specific parameters in the future. Having everything in the same place allows not only for easier tuning, but also easier understanding since all NUMA related code is on the same file. This patch introduces a new file to gather all NUMA/associativity handling code in spapr, spapr_numa.c. To get things started, let's remove associativity-reference-points and max-associativity-domains code from spapr_dt_rtas() to a new helper called spapr_numa_write_rtas_dt(). This will decouple spapr_dt_rtas() from the NUMA changes that are going to happen in those two properties. Signed-off-by: Daniel Henrique Barboza Message-Id: <20200901125645.118026-2-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/meson.build | 3 ++- hw/ppc/spapr.c | 26 ++----------------- hw/ppc/spapr_numa.c | 50 +++++++++++++++++++++++++++++++++++++ include/hw/ppc/spapr_numa.h | 20 +++++++++++++++ 4 files changed, 74 insertions(+), 25 deletions(-) create mode 100644 hw/ppc/spapr_numa.c create mode 100644 include/hw/ppc/spapr_numa.h diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build index 918969b320..ffa2ec37fa 100644 --- a/hw/ppc/meson.build +++ b/hw/ppc/meson.build @@ -25,7 +25,8 @@ ppc_ss.add(when: 'CONFIG_PSERIES', if_true: files( 'spapr_irq.c', 'spapr_tpm_proxy.c', 'spapr_nvdimm.c', - 'spapr_rtas_ddw.c' + 'spapr_rtas_ddw.c', + 'spapr_numa.c', )) ppc_ss.add(when: 'CONFIG_SPAPR_RNG', if_true: files('spapr_rng.c')) ppc_ss.add(when: ['CONFIG_PSERIES', 'CONFIG_LINUX'], if_true: files( diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index b0a04443fb..a45912acac 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -81,6 +81,7 @@ #include "hw/mem/memory-device.h" #include "hw/ppc/spapr_tpm_proxy.h" #include "hw/ppc/spapr_nvdimm.h" +#include "hw/ppc/spapr_numa.h" #include "monitor/monitor.h" @@ -891,16 +892,9 @@ static int spapr_dt_rng(void *fdt) static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) { MachineState *ms = MACHINE(spapr); - SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms); int rtas; GString *hypertas = g_string_sized_new(256); GString *qemu_hypertas = g_string_sized_new(256); - uint32_t refpoints[] = { - cpu_to_be32(0x4), - cpu_to_be32(0x4), - cpu_to_be32(0x2), - }; - uint32_t nr_refpoints = ARRAY_SIZE(refpoints); uint64_t max_device_addr = MACHINE(spapr)->device_memory->base + memory_region_size(&MACHINE(spapr)->device_memory->mr); uint32_t lrdr_capacity[] = { @@ -910,14 +904,6 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE & 0xffffffff), cpu_to_be32(ms->smp.max_cpus / ms->smp.threads), }; - uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 1 : 0); - uint32_t maxdomains[] = { - cpu_to_be32(4), - maxdomain, - maxdomain, - maxdomain, - cpu_to_be32(spapr->gpu_numa_id), - }; _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas")); @@ -953,15 +939,7 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) qemu_hypertas->str, qemu_hypertas->len)); g_string_free(qemu_hypertas, TRUE); - if (smc->pre_5_1_assoc_refpoints) { - nr_refpoints = 2; - } - - _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", - refpoints, nr_refpoints * sizeof(refpoints[0]))); - - _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", - maxdomains, sizeof(maxdomains))); + spapr_numa_write_rtas_dt(spapr, fdt, rtas); /* * FWNMI reserves RTAS_ERROR_LOG_MAX for the machine check error log, diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c new file mode 100644 index 0000000000..cdf3288cbd --- /dev/null +++ b/hw/ppc/spapr_numa.c @@ -0,0 +1,50 @@ +/* + * QEMU PowerPC pSeries Logical Partition NUMA associativity handling + * + * Copyright IBM Corp. 2020 + * + * Authors: + * Daniel Henrique Barboza + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "hw/ppc/spapr_numa.h" +#include "hw/ppc/fdt.h" + +/* + * Helper that writes ibm,associativity-reference-points and + * max-associativity-domains in the RTAS pointed by @rtas + * in the DT @fdt. + */ +void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) +{ + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); + uint32_t refpoints[] = { + cpu_to_be32(0x4), + cpu_to_be32(0x4), + cpu_to_be32(0x2), + }; + uint32_t nr_refpoints = ARRAY_SIZE(refpoints); + uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 1 : 0); + uint32_t maxdomains[] = { + cpu_to_be32(4), + maxdomain, + maxdomain, + maxdomain, + cpu_to_be32(spapr->gpu_numa_id), + }; + + if (smc->pre_5_1_assoc_refpoints) { + nr_refpoints = 2; + } + + _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", + refpoints, nr_refpoints * sizeof(refpoints[0]))); + + _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", + maxdomains, sizeof(maxdomains))); +} diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h new file mode 100644 index 0000000000..7a370a8768 --- /dev/null +++ b/include/hw/ppc/spapr_numa.h @@ -0,0 +1,20 @@ +/* + * QEMU PowerPC pSeries Logical Partition NUMA associativity handling + * + * Copyright IBM Corp. 2020 + * + * Authors: + * Daniel Henrique Barboza + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_SPAPR_NUMA_H +#define HW_SPAPR_NUMA_H + +#include "hw/ppc/spapr.h" + +void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas); + +#endif /* HW_SPAPR_NUMA_H */ From 6ee1d62e6af84c5ccda06763f2e58fb3f41ba106 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Tue, 1 Sep 2020 09:56:40 -0300 Subject: [PATCH 26/33] ppc/spapr_nvdimm: turn spapr_dt_nvdimm() static This function is only used inside spapr_nvdimm.c. Signed-off-by: Daniel Henrique Barboza Message-Id: <20200901125645.118026-3-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_nvdimm.c | 22 +++++++++++----------- include/hw/ppc/spapr_nvdimm.h | 1 - 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 95cbc30528..5188e2f503 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -106,16 +106,6 @@ void spapr_add_nvdimm(DeviceState *dev, uint64_t slot, Error **errp) } } -int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, - void *fdt, int *fdt_start_offset, Error **errp) -{ - NVDIMMDevice *nvdimm = NVDIMM(drc->dev); - - *fdt_start_offset = spapr_dt_nvdimm(fdt, 0, nvdimm); - - return 0; -} - void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr) { MachineState *machine = MACHINE(spapr); @@ -127,7 +117,7 @@ void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr) } -int spapr_dt_nvdimm(void *fdt, int parent_offset, +static int spapr_dt_nvdimm(void *fdt, int parent_offset, NVDIMMDevice *nvdimm) { int child_offset; @@ -184,6 +174,16 @@ int spapr_dt_nvdimm(void *fdt, int parent_offset, return child_offset; } +int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, + void *fdt, int *fdt_start_offset, Error **errp) +{ + NVDIMMDevice *nvdimm = NVDIMM(drc->dev); + + *fdt_start_offset = spapr_dt_nvdimm(fdt, 0, nvdimm); + + return 0; +} + void spapr_dt_persistent_memory(void *fdt) { int offset = fdt_subnode_offset(fdt, 0, "persistent-memory"); diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h index fd1736634c..10a6d9dbbc 100644 --- a/include/hw/ppc/spapr_nvdimm.h +++ b/include/hw/ppc/spapr_nvdimm.h @@ -27,7 +27,6 @@ QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, void *fdt, int *fdt_start_offset, Error **errp); -int spapr_dt_nvdimm(void *fdt, int parent_offset, NVDIMMDevice *nvdimm); void spapr_dt_persistent_memory(void *fdt); void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp); From f1aa45fffeeb084a9ad8bd08e83c5ec6af223884 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Thu, 3 Sep 2020 19:06:33 -0300 Subject: [PATCH 27/33] spapr: introduce SpaprMachineState::numa_assoc_array The next step to centralize all NUMA/associativity handling in the spapr machine is to create a 'one stop place' for all things ibm,associativity. This patch introduces numa_assoc_array, a 2 dimensional array that will store all ibm,associativity arrays of all NUMA nodes. This array is initialized in a new spapr_numa_associativity_init() function, called in spapr_machine_init(). It is being initialized with the same values used in other ibm,associativity properties around spapr files (i.e. all zeros, last value is node_id). The idea is to remove all hardcoded definitions and FDT writes of ibm,associativity arrays, doing instead a call to the new helper spapr_numa_write_associativity_dt() helper, that will be able to write the DT with the correct values. We'll start small, handling the trivial cases first. The remaining instances of ibm,associativity will be handled next. Signed-off-by: Daniel Henrique Barboza Message-Id: <20200903220639.563090-2-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 23 ++++++++++------------- hw/ppc/spapr_numa.c | 30 ++++++++++++++++++++++++++++++ hw/ppc/spapr_nvdimm.c | 19 +++++++------------ hw/ppc/spapr_pci.c | 9 ++------- include/hw/ppc/spapr.h | 12 ++++++++++++ include/hw/ppc/spapr_numa.h | 11 +++++++++++ include/hw/ppc/spapr_nvdimm.h | 2 +- 7 files changed, 73 insertions(+), 33 deletions(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index a45912acac..1ad6f59863 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -314,14 +314,9 @@ static void add_str(GString *s, const gchar *s1) g_string_append_len(s, s1, strlen(s1) + 1); } -static int spapr_dt_memory_node(void *fdt, int nodeid, hwaddr start, - hwaddr size) +static int spapr_dt_memory_node(SpaprMachineState *spapr, void *fdt, int nodeid, + hwaddr start, hwaddr size) { - uint32_t associativity[] = { - cpu_to_be32(0x4), /* length */ - cpu_to_be32(0x0), cpu_to_be32(0x0), - cpu_to_be32(0x0), cpu_to_be32(nodeid) - }; char mem_name[32]; uint64_t mem_reg_property[2]; int off; @@ -335,8 +330,7 @@ static int spapr_dt_memory_node(void *fdt, int nodeid, hwaddr start, _FDT((fdt_setprop_string(fdt, off, "device_type", "memory"))); _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property, sizeof(mem_reg_property)))); - _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity, - sizeof(associativity)))); + spapr_numa_write_associativity_dt(spapr, fdt, off, nodeid); return off; } @@ -649,7 +643,7 @@ static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt) if (!mem_start) { /* spapr_machine_init() checks for rma_size <= node0_size * already */ - spapr_dt_memory_node(fdt, i, 0, spapr->rma_size); + spapr_dt_memory_node(spapr, fdt, i, 0, spapr->rma_size); mem_start += spapr->rma_size; node_size -= spapr->rma_size; } @@ -661,7 +655,7 @@ static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt) sizetmp = 1ULL << ctzl(mem_start); } - spapr_dt_memory_node(fdt, i, mem_start, sizetmp); + spapr_dt_memory_node(spapr, fdt, i, mem_start, sizetmp); node_size -= sizetmp; mem_start += sizetmp; } @@ -1275,7 +1269,7 @@ void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space) /* NVDIMM devices */ if (mc->nvdimm_supported) { - spapr_dt_persistent_memory(fdt); + spapr_dt_persistent_memory(spapr, fdt); } return fdt; @@ -2810,6 +2804,9 @@ static void spapr_machine_init(MachineState *machine) */ spapr->gpu_numa_id = MAX(1, machine->numa_state->num_nodes); + /* Init numa_assoc_array */ + spapr_numa_associativity_init(spapr, machine); + if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) && ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0, spapr->max_compat_pvr)) { @@ -3394,7 +3391,7 @@ int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE; node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP, &error_abort); - *fdt_start_offset = spapr_dt_memory_node(fdt, node, addr, + *fdt_start_offset = spapr_dt_memory_node(spapr, fdt, node, addr, SPAPR_MEMORY_BLOCK_SIZE); return 0; } diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index cdf3288cbd..f6b6fe648f 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -15,6 +15,36 @@ #include "hw/ppc/spapr_numa.h" #include "hw/ppc/fdt.h" + +void spapr_numa_associativity_init(SpaprMachineState *spapr, + MachineState *machine) +{ + int nb_numa_nodes = machine->numa_state->num_nodes; + int i; + + /* + * For all associativity arrays: first position is the size, + * position MAX_DISTANCE_REF_POINTS is always the numa_id, + * represented by the index 'i'. + * + * This will break on sparse NUMA setups, when/if QEMU starts + * to support it, because there will be no more guarantee that + * 'i' will be a valid node_id set by the user. + */ + for (i = 0; i < nb_numa_nodes; i++) { + spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); + spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); + } +} + +void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, + int offset, int nodeid) +{ + _FDT((fdt_setprop(fdt, offset, "ibm,associativity", + spapr->numa_assoc_array[nodeid], + sizeof(spapr->numa_assoc_array[nodeid])))); +} + /* * Helper that writes ibm,associativity-reference-points and * max-associativity-domains in the RTAS pointed by @rtas diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 5188e2f503..63872054f3 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -31,6 +31,7 @@ #include "hw/ppc/fdt.h" #include "qemu/range.h" #include "sysemu/sysemu.h" +#include "hw/ppc/spapr_numa.h" void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp) @@ -117,8 +118,8 @@ void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr) } -static int spapr_dt_nvdimm(void *fdt, int parent_offset, - NVDIMMDevice *nvdimm) +static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt, + int parent_offset, NVDIMMDevice *nvdimm) { int child_offset; char *buf; @@ -128,11 +129,6 @@ static int spapr_dt_nvdimm(void *fdt, int parent_offset, &error_abort); uint64_t slot = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_SLOT_PROP, &error_abort); - uint32_t associativity[] = { - cpu_to_be32(0x4), /* length */ - cpu_to_be32(0x0), cpu_to_be32(0x0), - cpu_to_be32(0x0), cpu_to_be32(node) - }; uint64_t lsize = nvdimm->label_size; uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP, NULL); @@ -152,8 +148,7 @@ static int spapr_dt_nvdimm(void *fdt, int parent_offset, _FDT((fdt_setprop_string(fdt, child_offset, "compatible", "ibm,pmemory"))); _FDT((fdt_setprop_string(fdt, child_offset, "device_type", "ibm,pmemory"))); - _FDT((fdt_setprop(fdt, child_offset, "ibm,associativity", associativity, - sizeof(associativity)))); + spapr_numa_write_associativity_dt(spapr, fdt, child_offset, node); buf = qemu_uuid_unparse_strdup(&nvdimm->uuid); _FDT((fdt_setprop_string(fdt, child_offset, "ibm,unit-guid", buf))); @@ -179,12 +174,12 @@ int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, { NVDIMMDevice *nvdimm = NVDIMM(drc->dev); - *fdt_start_offset = spapr_dt_nvdimm(fdt, 0, nvdimm); + *fdt_start_offset = spapr_dt_nvdimm(spapr, fdt, 0, nvdimm); return 0; } -void spapr_dt_persistent_memory(void *fdt) +void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt) { int offset = fdt_subnode_offset(fdt, 0, "persistent-memory"); GSList *iter, *nvdimms = nvdimm_get_device_list(); @@ -202,7 +197,7 @@ void spapr_dt_persistent_memory(void *fdt) for (iter = nvdimms; iter; iter = iter->next) { NVDIMMDevice *nvdimm = iter->data; - spapr_dt_nvdimm(fdt, offset, nvdimm); + spapr_dt_nvdimm(spapr, fdt, offset, nvdimm); } g_slist_free(nvdimms); diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 0a418f1e67..4d97ff6c70 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -52,6 +52,7 @@ #include "sysemu/kvm.h" #include "sysemu/hostmem.h" #include "sysemu/numa.h" +#include "hw/ppc/spapr_numa.h" /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */ #define RTAS_QUERY_FN 0 @@ -2321,11 +2322,6 @@ int spapr_dt_phb(SpaprMachineState *spapr, SpaprPhbState *phb, cpu_to_be32(1), cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) }; - uint32_t associativity[] = {cpu_to_be32(0x4), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(phb->numa_node)}; SpaprTceTable *tcet; SpaprDrc *drc; Error *err = NULL; @@ -2358,8 +2354,7 @@ int spapr_dt_phb(SpaprMachineState *spapr, SpaprPhbState *phb, /* Advertise NUMA via ibm,associativity */ if (phb->numa_node != -1) { - _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, - sizeof(associativity))); + spapr_numa_write_associativity_dt(spapr, fdt, bus_off, phb->numa_node); } /* Build the interrupt-map, this must matches what is done diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index a1e230ad39..9a63380801 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -105,6 +105,16 @@ typedef enum { #define FDT_MAX_SIZE 0x100000 +/* + * NUMA related macros. MAX_DISTANCE_REF_POINTS was taken + * from Taken from Linux kernel arch/powerpc/mm/numa.h. + * + * NUMA_ASSOC_SIZE is the base array size of an ibm,associativity + * array for any non-CPU resource. + */ +#define MAX_DISTANCE_REF_POINTS 4 +#define NUMA_ASSOC_SIZE (MAX_DISTANCE_REF_POINTS + 1) + typedef struct SpaprCapabilities SpaprCapabilities; struct SpaprCapabilities { uint8_t caps[SPAPR_CAP_NUM]; @@ -231,6 +241,8 @@ struct SpaprMachineState { unsigned gpu_numa_id; SpaprTpmProxy *tpm_proxy; + uint32_t numa_assoc_array[MAX_NODES][NUMA_ASSOC_SIZE]; + Error *fwnmi_migration_blocker; }; diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h index 7a370a8768..a2a4df55f7 100644 --- a/include/hw/ppc/spapr_numa.h +++ b/include/hw/ppc/spapr_numa.h @@ -13,8 +13,19 @@ #ifndef HW_SPAPR_NUMA_H #define HW_SPAPR_NUMA_H +#include "hw/boards.h" #include "hw/ppc/spapr.h" +/* + * Having both SpaprMachineState and MachineState as arguments + * feels odd, but it will spare a MACHINE() call inside the + * function. spapr_machine_init() is the only caller for it, and + * it has both pointers resolved already. + */ +void spapr_numa_associativity_init(SpaprMachineState *spapr, + MachineState *machine); void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas); +void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, + int offset, int nodeid); #endif /* HW_SPAPR_NUMA_H */ diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h index 10a6d9dbbc..3eb344e8e9 100644 --- a/include/hw/ppc/spapr_nvdimm.h +++ b/include/hw/ppc/spapr_nvdimm.h @@ -27,7 +27,7 @@ QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, void *fdt, int *fdt_start_offset, Error **errp); -void spapr_dt_persistent_memory(void *fdt); +void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt); void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp); void spapr_add_nvdimm(DeviceState *dev, uint64_t slot, Error **errp); From 8f86a408241264db605da9a1215a409475a60bd0 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Thu, 3 Sep 2020 19:06:34 -0300 Subject: [PATCH 28/33] spapr, spapr_numa: handle vcpu ibm,associativity Vcpus have an additional paramenter to be appended, vcpu_id. This also changes the size of the of property itself, which is being represented in index 0 of numa_assoc_array[cpu->node_id], and defaults to MAX_DISTANCE_REF_POINTS for all cases but vcpus. All this logic makes more sense in spapr_numa.c, where we handle everything NUMA and associativity. A new helper spapr_numa_fixup_cpu_dt() was added, and spapr.c uses it the same way as it was using the former spapr_fixup_cpu_numa_dt(). Signed-off-by: Daniel Henrique Barboza Message-Id: <20200903220639.563090-3-danielhb413@gmail.com> [dwg: Correct uint to int type, which can break windows builds] Signed-off-by: David Gibson --- hw/ppc/spapr.c | 17 +---------------- hw/ppc/spapr_numa.c | 27 +++++++++++++++++++++++++++ include/hw/ppc/spapr_numa.h | 2 ++ 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 1ad6f59863..badfa86319 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -202,21 +202,6 @@ static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu, return ret; } -static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, PowerPCCPU *cpu) -{ - int index = spapr_get_vcpu_id(cpu); - uint32_t associativity[] = {cpu_to_be32(0x5), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(cpu->node_id), - cpu_to_be32(index)}; - - /* Advertise NUMA via ibm,associativity */ - return fdt_setprop(fdt, offset, "ibm,associativity", associativity, - sizeof(associativity)); -} - static void spapr_dt_pa_features(SpaprMachineState *spapr, PowerPCCPU *cpu, void *fdt, int offset) @@ -785,7 +770,7 @@ static void spapr_dt_cpu(CPUState *cs, void *fdt, int offset, pft_size_prop, sizeof(pft_size_prop)))); if (ms->numa_state->num_nodes > 1) { - _FDT(spapr_fixup_cpu_numa_dt(fdt, offset, cpu)); + _FDT(spapr_numa_fixup_cpu_dt(spapr, fdt, offset, cpu)); } _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt)); diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index f6b6fe648f..523801fb14 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -45,6 +45,33 @@ void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, sizeof(spapr->numa_assoc_array[nodeid])))); } +int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, + int offset, PowerPCCPU *cpu) +{ + int vcpu_assoc_size = NUMA_ASSOC_SIZE + 1; + uint32_t vcpu_assoc[vcpu_assoc_size]; + int index = spapr_get_vcpu_id(cpu); + int i; + + /* + * VCPUs have an extra 'cpu_id' value in ibm,associativity + * compared to other resources. Increment the size at index + * 0, copy all associativity domains already set, then put + * cpu_id last. + */ + vcpu_assoc[0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS + 1); + + for (i = 1; i <= MAX_DISTANCE_REF_POINTS; i++) { + vcpu_assoc[i] = spapr->numa_assoc_array[cpu->node_id][i]; + } + + vcpu_assoc[vcpu_assoc_size - 1] = cpu_to_be32(index); + + /* Advertise NUMA via ibm,associativity */ + return fdt_setprop(fdt, offset, "ibm,associativity", + vcpu_assoc, sizeof(vcpu_assoc)); +} + /* * Helper that writes ibm,associativity-reference-points and * max-associativity-domains in the RTAS pointed by @rtas diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h index a2a4df55f7..43c6a16fe3 100644 --- a/include/hw/ppc/spapr_numa.h +++ b/include/hw/ppc/spapr_numa.h @@ -27,5 +27,7 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas); void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, int offset, int nodeid); +int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, + int offset, PowerPCCPU *cpu); #endif /* HW_SPAPR_NUMA_H */ From 0ee520126a2a1a2539e54b6d0879380cb0bf4ea4 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Thu, 3 Sep 2020 19:06:35 -0300 Subject: [PATCH 29/33] spapr, spapr_numa: move lookup-arrays handling to spapr_numa.c In a similar fashion as the previous patch, let's move the handling of ibm,associativity-lookup-arrays from spapr.c to spapr_numa.c. A spapr_numa_write_assoc_lookup_arrays() helper was created, and spapr_dt_dynamic_reconfiguration_memory() can now use it to advertise the lookup-arrays. Signed-off-by: Daniel Henrique Barboza Message-Id: <20200903220639.563090-4-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 25 ++----------------------- hw/ppc/spapr_numa.c | 34 ++++++++++++++++++++++++++++++++++ include/hw/ppc/spapr_numa.h | 2 ++ 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index badfa86319..9bce1892b5 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -535,13 +535,10 @@ static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr, void *fdt) { MachineState *machine = MACHINE(spapr); - int nb_numa_nodes = machine->numa_state->num_nodes; - int ret, i, offset; + int ret, offset; uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE; uint32_t prop_lmb_size[] = {cpu_to_be32(lmb_size >> 32), cpu_to_be32(lmb_size & 0xffffffff)}; - uint32_t *int_buf, *cur_index, buf_len; - int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1; MemoryDeviceInfoList *dimms = NULL; /* @@ -582,25 +579,7 @@ static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr, return ret; } - /* ibm,associativity-lookup-arrays */ - buf_len = (nr_nodes * 4 + 2) * sizeof(uint32_t); - cur_index = int_buf = g_malloc0(buf_len); - int_buf[0] = cpu_to_be32(nr_nodes); - int_buf[1] = cpu_to_be32(4); /* Number of entries per associativity list */ - cur_index += 2; - for (i = 0; i < nr_nodes; i++) { - uint32_t associativity[] = { - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(i) - }; - memcpy(cur_index, associativity, sizeof(associativity)); - cur_index += 4; - } - ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf, - (cur_index - int_buf) * sizeof(uint32_t)); - g_free(int_buf); + ret = spapr_numa_write_assoc_lookup_arrays(spapr, fdt, offset); return ret; } diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 523801fb14..42394f36e9 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -72,6 +72,40 @@ int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, vcpu_assoc, sizeof(vcpu_assoc)); } + +int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, + int offset) +{ + MachineState *machine = MACHINE(spapr); + int nb_numa_nodes = machine->numa_state->num_nodes; + int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1; + uint32_t *int_buf, *cur_index, buf_len; + int ret, i; + + /* ibm,associativity-lookup-arrays */ + buf_len = (nr_nodes * MAX_DISTANCE_REF_POINTS + 2) * sizeof(uint32_t); + cur_index = int_buf = g_malloc0(buf_len); + int_buf[0] = cpu_to_be32(nr_nodes); + /* Number of entries per associativity list */ + int_buf[1] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); + cur_index += 2; + for (i = 0; i < nr_nodes; i++) { + /* + * For the lookup-array we use the ibm,associativity array, + * from numa_assoc_array. without the first element (size). + */ + uint32_t *associativity = spapr->numa_assoc_array[i]; + memcpy(cur_index, ++associativity, + sizeof(uint32_t) * MAX_DISTANCE_REF_POINTS); + cur_index += MAX_DISTANCE_REF_POINTS; + } + ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf, + (cur_index - int_buf) * sizeof(uint32_t)); + g_free(int_buf); + + return ret; +} + /* * Helper that writes ibm,associativity-reference-points and * max-associativity-domains in the RTAS pointed by @rtas diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h index 43c6a16fe3..b3fd950634 100644 --- a/include/hw/ppc/spapr_numa.h +++ b/include/hw/ppc/spapr_numa.h @@ -29,5 +29,7 @@ void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, int offset, int nodeid); int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, int offset, PowerPCCPU *cpu); +int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, + int offset); #endif /* HW_SPAPR_NUMA_H */ From dd7e1d7ae431ee61b0e59bf505fc39dc17bba24b Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Thu, 3 Sep 2020 19:06:36 -0300 Subject: [PATCH 30/33] spapr_numa: move NVLink2 associativity handling to spapr_numa.c The NVLink2 GPUs works like a regular NUMA node with its own associativity values, regardless of user input. This can be handled inside spapr_numa_associativity_init(), initializing NVGPU_MAX_NUM associativity arrays that can be used by the GPUs. Signed-off-by: Daniel Henrique Barboza Message-Id: <20200903220639.563090-5-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 28 +++++++++++++++++++++++++++- hw/ppc/spapr_pci_nvlink2.c | 20 +++----------------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 42394f36e9..9073dbbceb 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -13,14 +13,18 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "hw/ppc/spapr_numa.h" +#include "hw/pci-host/spapr.h" #include "hw/ppc/fdt.h" +/* Moved from hw/ppc/spapr_pci_nvlink2.c */ +#define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) void spapr_numa_associativity_init(SpaprMachineState *spapr, MachineState *machine) { + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); int nb_numa_nodes = machine->numa_state->num_nodes; - int i; + int i, j, max_nodes_with_gpus; /* * For all associativity arrays: first position is the size, @@ -35,6 +39,28 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); } + + /* + * Initialize NVLink GPU associativity arrays. We know that + * the first GPU will take the first available NUMA id, and + * we'll have a maximum of NVGPU_MAX_NUM GPUs in the machine. + * At this point we're not sure if there are GPUs or not, but + * let's initialize the associativity arrays and allow NVLink + * GPUs to be handled like regular NUMA nodes later on. + */ + max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM; + + for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) { + spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); + + for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) { + uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ? + SPAPR_GPU_NUMA_ID : cpu_to_be32(i); + spapr->numa_assoc_array[i][j] = gpu_assoc; + } + + spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); + } } void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c index 76ae77ebc8..8ef9b40a18 100644 --- a/hw/ppc/spapr_pci_nvlink2.c +++ b/hw/ppc/spapr_pci_nvlink2.c @@ -26,6 +26,7 @@ #include "qemu-common.h" #include "hw/pci/pci.h" #include "hw/pci-host/spapr.h" +#include "hw/ppc/spapr_numa.h" #include "qemu/error-report.h" #include "hw/ppc/fdt.h" #include "hw/pci/pci_bridge.h" @@ -37,8 +38,6 @@ #define PHANDLE_NVLINK(phb, gn, nn) (0x00130000 | (((phb)->index) << 8) | \ ((gn) << 4) | (nn)) -#define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) - typedef struct SpaprPhbPciNvGpuSlot { uint64_t tgt; uint64_t gpa; @@ -360,13 +359,6 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt) Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev), "nvlink2-mr[0]", &error_abort); - uint32_t associativity[] = { - cpu_to_be32(0x4), - cpu_to_be32(nvslot->numa_id), - cpu_to_be32(nvslot->numa_id), - cpu_to_be32(nvslot->numa_id), - cpu_to_be32(nvslot->numa_id) - }; uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL); uint64_t mem_reg[2] = { cpu_to_be64(nvslot->gpa), cpu_to_be64(size) }; char *mem_name = g_strdup_printf("memory@%"PRIx64, nvslot->gpa); @@ -376,14 +368,8 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt) _FDT((fdt_setprop_string(fdt, off, "device_type", "memory"))); _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg)))); - if (sphb->pre_5_1_assoc) { - associativity[1] = SPAPR_GPU_NUMA_ID; - associativity[2] = SPAPR_GPU_NUMA_ID; - associativity[3] = SPAPR_GPU_NUMA_ID; - } - - _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity, - sizeof(associativity)))); + spapr_numa_write_associativity_dt(SPAPR_MACHINE(qdev_get_machine()), + fdt, off, nvslot->numa_id); _FDT((fdt_setprop_string(fdt, off, "compatible", "ibm,coherent-device-memory"))); From f8a13fc3812562ad46db1d37d9a176613731698f Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Fri, 4 Sep 2020 14:24:20 -0300 Subject: [PATCH 31/33] spapr: move h_home_node_associativity to spapr_numa.c The implementation of this hypercall will be modified to use spapr->numa_assoc_arrays input. Moving it to spapr_numa.c makes make more sense. Reviewed-by: Greg Kurz Signed-off-by: Daniel Henrique Barboza Message-Id: <20200904172422.617460-2-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_hcall.c | 40 --------------------------------------- hw/ppc/spapr_numa.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index c1d01228c6..c2776b6a7d 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -1873,42 +1873,6 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, return ret; } -static target_ulong h_home_node_associativity(PowerPCCPU *cpu, - SpaprMachineState *spapr, - target_ulong opcode, - target_ulong *args) -{ - target_ulong flags = args[0]; - target_ulong procno = args[1]; - PowerPCCPU *tcpu; - int idx; - - /* only support procno from H_REGISTER_VPA */ - if (flags != 0x1) { - return H_FUNCTION; - } - - tcpu = spapr_find_cpu(procno); - if (tcpu == NULL) { - return H_P2; - } - - /* sequence is the same as in the "ibm,associativity" property */ - - idx = 0; -#define ASSOCIATIVITY(a, b) (((uint64_t)(a) << 32) | \ - ((uint64_t)(b) & 0xffffffff)) - args[idx++] = ASSOCIATIVITY(0, 0); - args[idx++] = ASSOCIATIVITY(0, tcpu->node_id); - args[idx++] = ASSOCIATIVITY(procno, -1); - for ( ; idx < 6; idx++) { - args[idx] = -1; - } -#undef ASSOCIATIVITY - - return H_SUCCESS; -} - static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu, SpaprMachineState *spapr, target_ulong opcode, @@ -2139,10 +2103,6 @@ static void hypercall_register_types(void) spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support); spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt); - - /* Virtual Processor Home Node */ - spapr_register_hypercall(H_HOME_NODE_ASSOCIATIVITY, - h_home_node_associativity); } type_init(hypercall_register_types) diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 9073dbbceb..e60587ce81 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -165,3 +165,48 @@ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", maxdomains, sizeof(maxdomains))); } + +static target_ulong h_home_node_associativity(PowerPCCPU *cpu, + SpaprMachineState *spapr, + target_ulong opcode, + target_ulong *args) +{ + target_ulong flags = args[0]; + target_ulong procno = args[1]; + PowerPCCPU *tcpu; + int idx; + + /* only support procno from H_REGISTER_VPA */ + if (flags != 0x1) { + return H_FUNCTION; + } + + tcpu = spapr_find_cpu(procno); + if (tcpu == NULL) { + return H_P2; + } + + /* sequence is the same as in the "ibm,associativity" property */ + + idx = 0; +#define ASSOCIATIVITY(a, b) (((uint64_t)(a) << 32) | \ + ((uint64_t)(b) & 0xffffffff)) + args[idx++] = ASSOCIATIVITY(0, 0); + args[idx++] = ASSOCIATIVITY(0, tcpu->node_id); + args[idx++] = ASSOCIATIVITY(procno, -1); + for ( ; idx < 6; idx++) { + args[idx] = -1; + } +#undef ASSOCIATIVITY + + return H_SUCCESS; +} + +static void spapr_numa_register_types(void) +{ + /* Virtual Processor Home Node */ + spapr_register_hypercall(H_HOME_NODE_ASSOCIATIVITY, + h_home_node_associativity); +} + +type_init(spapr_numa_register_types) From d370f9cf0aac54b091047392427d3d5ec93182c3 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Fri, 4 Sep 2020 14:24:21 -0300 Subject: [PATCH 32/33] spapr_numa: create a vcpu associativity helper The work to be done in h_home_node_associativity() intersects with what is already done in spapr_numa_fixup_cpu_dt(). This patch creates a new helper, spapr_numa_get_vcpu_assoc(), to be used for both spapr_numa_fixup_cpu_dt() and h_home_node_associativity(). While we're at it, use memcpy() instead of loop assignment to created the returned array. Reviewed-by: Greg Kurz Signed-off-by: Daniel Henrique Barboza Message-Id: <20200904172422.617460-3-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 31 ++++++++++++++++++------------- include/hw/ppc/spapr.h | 7 ++++++- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index e60587ce81..066ffc2a5f 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -71,31 +71,36 @@ void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, sizeof(spapr->numa_assoc_array[nodeid])))); } -int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, - int offset, PowerPCCPU *cpu) +static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr, + PowerPCCPU *cpu) { - int vcpu_assoc_size = NUMA_ASSOC_SIZE + 1; - uint32_t vcpu_assoc[vcpu_assoc_size]; + uint32_t *vcpu_assoc = g_new(uint32_t, VCPU_ASSOC_SIZE); int index = spapr_get_vcpu_id(cpu); - int i; /* * VCPUs have an extra 'cpu_id' value in ibm,associativity * compared to other resources. Increment the size at index - * 0, copy all associativity domains already set, then put - * cpu_id last. + * 0, put cpu_id last, then copy the remaining associativity + * domains. */ vcpu_assoc[0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS + 1); + vcpu_assoc[VCPU_ASSOC_SIZE - 1] = cpu_to_be32(index); + memcpy(vcpu_assoc + 1, spapr->numa_assoc_array[cpu->node_id] + 1, + (VCPU_ASSOC_SIZE - 2) * sizeof(uint32_t)); - for (i = 1; i <= MAX_DISTANCE_REF_POINTS; i++) { - vcpu_assoc[i] = spapr->numa_assoc_array[cpu->node_id][i]; - } + return vcpu_assoc; +} - vcpu_assoc[vcpu_assoc_size - 1] = cpu_to_be32(index); +int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, + int offset, PowerPCCPU *cpu) +{ + g_autofree uint32_t *vcpu_assoc = NULL; + + vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, cpu); /* Advertise NUMA via ibm,associativity */ - return fdt_setprop(fdt, offset, "ibm,associativity", - vcpu_assoc, sizeof(vcpu_assoc)); + return fdt_setprop(fdt, offset, "ibm,associativity", vcpu_assoc, + VCPU_ASSOC_SIZE * sizeof(uint32_t)); } diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 9a63380801..e50a2672e3 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -107,13 +107,18 @@ typedef enum { /* * NUMA related macros. MAX_DISTANCE_REF_POINTS was taken - * from Taken from Linux kernel arch/powerpc/mm/numa.h. + * from Linux kernel arch/powerpc/mm/numa.h. It represents the + * amount of associativity domains for non-CPU resources. * * NUMA_ASSOC_SIZE is the base array size of an ibm,associativity * array for any non-CPU resource. + * + * VCPU_ASSOC_SIZE represents the size of ibm,associativity array + * for CPUs, which has an extra element (vcpu_id) in the end. */ #define MAX_DISTANCE_REF_POINTS 4 #define NUMA_ASSOC_SIZE (MAX_DISTANCE_REF_POINTS + 1) +#define VCPU_ASSOC_SIZE (NUMA_ASSOC_SIZE + 1) typedef struct SpaprCapabilities SpaprCapabilities; struct SpaprCapabilities { From 876ab8d89d0d288945334c8caa908b07ef847de2 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Fri, 4 Sep 2020 14:24:22 -0300 Subject: [PATCH 33/33] spapr_numa: use spapr_numa_get_vcpu_assoc() in home_node hcall The current implementation of h_home_node_associativity hard codes the values of associativity domains of the vcpus. Let's make it consider the values already initialized in spapr->numa_assoc_array, via the spapr_numa_get_vcpu_assoc() helper. We want to set it and forget it, and for that we also need to assert that we don't overflow the registers of the hypercall. >From R4 to R9 we can squeeze in 12 associativity domains for vcpus, so let's assert that VCPU_ASSOC_SIZE -1 isn't greater than that. Reviewed-by: Greg Kurz Signed-off-by: Daniel Henrique Barboza Message-Id: <20200904172422.617460-4-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 066ffc2a5f..64fe567f5d 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -176,10 +176,11 @@ static target_ulong h_home_node_associativity(PowerPCCPU *cpu, target_ulong opcode, target_ulong *args) { + g_autofree uint32_t *vcpu_assoc = NULL; target_ulong flags = args[0]; target_ulong procno = args[1]; PowerPCCPU *tcpu; - int idx; + int idx, assoc_idx; /* only support procno from H_REGISTER_VPA */ if (flags != 0x1) { @@ -191,16 +192,40 @@ static target_ulong h_home_node_associativity(PowerPCCPU *cpu, return H_P2; } - /* sequence is the same as in the "ibm,associativity" property */ + /* + * Given that we want to be flexible with the sizes and indexes, + * we must consider that there is a hard limit of how many + * associativities domain we can fit in R4 up to R9, which would be + * 12 associativity domains for vcpus. Assert and bail if that's + * not the case. + */ + G_STATIC_ASSERT((VCPU_ASSOC_SIZE - 1) <= 12); + + vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, tcpu); + /* assoc_idx starts at 1 to skip associativity size */ + assoc_idx = 1; - idx = 0; #define ASSOCIATIVITY(a, b) (((uint64_t)(a) << 32) | \ ((uint64_t)(b) & 0xffffffff)) - args[idx++] = ASSOCIATIVITY(0, 0); - args[idx++] = ASSOCIATIVITY(0, tcpu->node_id); - args[idx++] = ASSOCIATIVITY(procno, -1); - for ( ; idx < 6; idx++) { - args[idx] = -1; + + for (idx = 0; idx < 6; idx++) { + int32_t a, b; + + /* + * vcpu_assoc[] will contain the associativity domains for tcpu, + * including tcpu->node_id and procno, meaning that we don't + * need to use these variables here. + * + * We'll read 2 values at a time to fill up the ASSOCIATIVITY() + * macro. The ternary will fill the remaining registers with -1 + * after we went through vcpu_assoc[]. + */ + a = assoc_idx < VCPU_ASSOC_SIZE ? + be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1; + b = assoc_idx < VCPU_ASSOC_SIZE ? + be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1; + + args[idx] = ASSOCIATIVITY(a, b); } #undef ASSOCIATIVITY