From ed26b92290768818371fbfd4317988eab6009ad5 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Thu, 12 Feb 2015 15:23:48 -0200 Subject: [PATCH 1/6] numa: Fix off-by-one error at MAX_CPUMASK_BITS check Fix the CPU index check to ensure we don't go beyond the size of the node_cpu bitmap. CPU index is always less than MAX_CPUMASK_BITS, as documented at sysemu.h: > The following shall be true for all CPUs: > cpu->cpu_index < max_cpus <= MAX_CPUMASK_BITS Reviewed-by: Igor Mammedov Signed-off-by: Eduardo Habkost --- numa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numa.c b/numa.c index ffbec68fd8..13b2f01dba 100644 --- a/numa.c +++ b/numa.c @@ -76,9 +76,9 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) } for (cpus = node->cpus; cpus; cpus = cpus->next) { - if (cpus->value > MAX_CPUMASK_BITS) { + if (cpus->value >= MAX_CPUMASK_BITS) { error_setg(errp, "CPU number %" PRIu16 " is bigger than %d", - cpus->value, MAX_CPUMASK_BITS); + cpus->value, MAX_CPUMASK_BITS - 1); return; } bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); From 8979c945c1a7ffd20edbd5da2513c04baccfd7de Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Mon, 9 Feb 2015 17:28:52 -0200 Subject: [PATCH 2/6] numa: Reject CPU indexes > max_cpus CPU index is always less than max_cpus, as documented at sysemu.h: > The following shall be true for all CPUs: > cpu->cpu_index < max_cpus <= MAX_CPUMASK_BITS Reject configuration which uses invalid CPU indexes. Reviewed-by: Igor Mammedov Signed-off-by: Eduardo Habkost --- numa.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/numa.c b/numa.c index 13b2f01dba..6b4ab0eae4 100644 --- a/numa.c +++ b/numa.c @@ -76,9 +76,11 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) } for (cpus = node->cpus; cpus; cpus = cpus->next) { - if (cpus->value >= MAX_CPUMASK_BITS) { - error_setg(errp, "CPU number %" PRIu16 " is bigger than %d", - cpus->value, MAX_CPUMASK_BITS - 1); + if (cpus->value >= max_cpus) { + error_setg(errp, + "CPU index (%" PRIu16 ")" + " should be smaller than maxcpus (%d)", + cpus->value, max_cpus); return; } bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); From 3ef7197505e483e2f28c5fbd6ed54b4061221200 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Mon, 9 Feb 2015 17:32:04 -0200 Subject: [PATCH 3/6] numa: Reject configuration if CPU appears on multiple nodes Each CPU can appear in only one NUMA node on the NUMA config. Reject configuration if a CPU appears in multiple nodes. Reviewed-by: Igor Mammedov Signed-off-by: Eduardo Habkost --- numa.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/numa.c b/numa.c index 6b4ab0eae4..518aedd88e 100644 --- a/numa.c +++ b/numa.c @@ -167,6 +167,41 @@ error: return -1; } +static char *enumerate_cpus(unsigned long *cpus, int max_cpus) +{ + int cpu; + bool first = true; + GString *s = g_string_new(NULL); + + for (cpu = find_first_bit(cpus, max_cpus); + cpu < max_cpus; + cpu = find_next_bit(cpus, max_cpus, cpu + 1)) { + g_string_append_printf(s, "%s%d", first ? "" : " ", cpu); + first = false; + } + return g_string_free(s, FALSE); +} + +static void validate_numa_cpus(void) +{ + int i; + DECLARE_BITMAP(seen_cpus, MAX_CPUMASK_BITS); + + bitmap_zero(seen_cpus, MAX_CPUMASK_BITS); + for (i = 0; i < nb_numa_nodes; i++) { + if (bitmap_intersects(seen_cpus, numa_info[i].node_cpu, + MAX_CPUMASK_BITS)) { + bitmap_and(seen_cpus, seen_cpus, + numa_info[i].node_cpu, MAX_CPUMASK_BITS); + error_report("CPU(s) present in multiple NUMA nodes: %s", + enumerate_cpus(seen_cpus, max_cpus));; + exit(EXIT_FAILURE); + } + bitmap_or(seen_cpus, seen_cpus, + numa_info[i].node_cpu, MAX_CPUMASK_BITS); + } +} + void parse_numa_opts(void) { int i; @@ -244,6 +279,8 @@ void parse_numa_opts(void) set_bit(i, numa_info[i % nb_numa_nodes].node_cpu); } } + + validate_numa_cpus(); } } From 57924bcd87cb03cc21ebd7efed880d16ca048dce Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 19 Mar 2015 17:09:21 +0000 Subject: [PATCH 4/6] numa: introduce machine callback for VCPU to node mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current default round-robin way of distributing VCPUs among NUMA nodes might be wrong in case on multi-core/threads CPUs. Making guests confused wrt topology where cores from the same socket are on different nodes. Allow a machine to override default mapping by providing MachineClass::cpu_index_to_socket_id() callback which would allow it group VCPUs from a socket on the same NUMA node. Signed-off-by: Igor Mammedov Reviewed-by: Andreas Färber Signed-off-by: Eduardo Habkost --- include/hw/boards.h | 5 +++++ include/sysemu/numa.h | 3 ++- numa.c | 18 +++++++++++++----- vl.c | 2 +- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/hw/boards.h b/include/hw/boards.h index 1feea2b176..78838d13d4 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -82,6 +82,10 @@ bool machine_mem_merge(MachineState *machine); * of HotplugHandler object, which handles hotplug operation * for a given @dev. It may return NULL if @dev doesn't require * any actions to be performed by hotplug handler. + * @cpu_index_to_socket_id: + * used to provide @cpu_index to socket number mapping, allowing + * a machine to group CPU threads belonging to the same socket/package + * Returns: socket number given cpu_index belongs to. */ struct MachineClass { /*< private >*/ @@ -118,6 +122,7 @@ struct MachineClass { HotplugHandler *(*get_hotplug_handler)(MachineState *machine, DeviceState *dev); + unsigned (*cpu_index_to_socket_id)(unsigned cpu_index); }; /** diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h index 5633b856a8..6523b4d7f9 100644 --- a/include/sysemu/numa.h +++ b/include/sysemu/numa.h @@ -6,6 +6,7 @@ #include "qemu/option.h" #include "sysemu/sysemu.h" #include "sysemu/hostmem.h" +#include "hw/boards.h" extern int nb_numa_nodes; /* Number of NUMA nodes */ @@ -16,7 +17,7 @@ typedef struct node_info { bool present; } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; -void parse_numa_opts(void); +void parse_numa_opts(MachineClass *mc); void numa_post_machine_init(void); void query_numa_node_mem(uint64_t node_mem[]); extern QemuOptsList qemu_numa_opts; diff --git a/numa.c b/numa.c index 518aedd88e..fe74e1eeaf 100644 --- a/numa.c +++ b/numa.c @@ -202,7 +202,7 @@ static void validate_numa_cpus(void) } } -void parse_numa_opts(void) +void parse_numa_opts(MachineClass *mc) { int i; @@ -270,13 +270,21 @@ void parse_numa_opts(void) break; } } - /* assigning the VCPUs round-robin is easier to implement, guest OSes - * must cope with this anyway, because there are BIOSes out there in - * real machines which also use this scheme. + /* Historically VCPUs were assigned in round-robin order to NUMA + * nodes. However it causes issues with guest not handling it nice + * in case where cores/threads from a multicore CPU appear on + * different nodes. So allow boards to override default distribution + * rule grouping VCPUs by socket so that VCPUs from the same socket + * would be on the same node. */ if (i == nb_numa_nodes) { for (i = 0; i < max_cpus; i++) { - set_bit(i, numa_info[i % nb_numa_nodes].node_cpu); + unsigned node_id = i % nb_numa_nodes; + if (mc->cpu_index_to_socket_id) { + node_id = mc->cpu_index_to_socket_id(i) % nb_numa_nodes; + } + + set_bit(i, numa_info[node_id].node_cpu); } } diff --git a/vl.c b/vl.c index 69617d640a..75ec292216 100644 --- a/vl.c +++ b/vl.c @@ -4170,7 +4170,7 @@ int main(int argc, char **argv, char **envp) default_drive(default_floppy, snapshot, IF_FLOPPY, 0, FD_OPTS); default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS); - parse_numa_opts(); + parse_numa_opts(machine_class); if (qemu_opts_foreach(qemu_find_opts("mon"), mon_init_func, NULL, 1) != 0) { exit(1); From fb43b73b9225ff2d19cf5350c68112aade7eec13 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 19 Mar 2015 17:09:22 +0000 Subject: [PATCH 5/6] pc: fix default VCPU to NUMA node mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit dd0247e0 pc: acpi: mark all possible CPUs as enabled in SRAT Linux kernel actually tries to use CPU to Node mapping from QEMU provided SRAT table instead of discarding it, and that in some cases breaks build_sched_domains() which expects sane mapping where cores/threads belonging to the same socket are on the same NUMA node. With current default round-robin mapping of VCPUs to nodes guest ends-up with cores/threads belonging to the same socket being on different NUMA nodes. For example with following CLI: qemu-system-x86_64 -m 4G \ -cpu Opteron_G3,vendor=AuthenticAMD \ -smp 5,sockets=1,cores=4,threads=1,maxcpus=8 \ -numa node,nodeid=0 -numa node,nodeid=1 2.6.32 based kernels will hang on boot due to incorrectly built sched_group-s list in update_sd_lb_stats() Replacing default mapping with a manual, where VCPUs belonging to the same socket are on the same NUMA node, fixes the issue for guests which can't handle nonsense topology i.e. changing CLI to: -numa node,nodeid=0,cpus=0-3 -numa node,nodeid=1,cpus=4-7 So instead of simply scattering VCPUs around nodes, provide callback to map the same socket VCPUs to the same NUMA node, which is what guests would expect from a sane hardware/BIOS. Signed-off-by: Igor Mammedov Reviewed-by: Andreas Färber Signed-off-by: Eduardo Habkost --- hw/i386/pc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 4b46c299c3..a52d2aff7c 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1851,6 +1851,14 @@ static void pc_machine_initfn(Object *obj) NULL, NULL); } +static unsigned pc_cpu_index_to_socket_id(unsigned cpu_index) +{ + unsigned pkg_id, core_id, smt_id; + x86_topo_ids_from_idx(smp_cores, smp_threads, cpu_index, + &pkg_id, &core_id, &smt_id); + return pkg_id; +} + static void pc_machine_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); @@ -1859,6 +1867,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) pcmc->get_hotplug_handler = mc->get_hotplug_handler; mc->get_hotplug_handler = pc_get_hotpug_handler; + mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id; hc->plug = pc_machine_device_plug_cb; hc->unplug_request = pc_machine_device_unplug_request_cb; hc->unplug = pc_machine_device_unplug_cb; From 549fc54b8cfe16a475d8f6b8f838e53b45452b4a Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Mon, 9 Feb 2015 17:35:04 -0200 Subject: [PATCH 6/6] numa: Print warning if no node is assigned to a CPU We need all possible CPUs (including hotplug ones) to be present in the SRAT when QEMU starts. QEMU already does that correctly today, the only problem is that when a CPU is omitted from the NUMA configuration, it is silently assigned to node 0. Check if all CPUs up to max_cpus are present in the NUMA configuration and warn about missing CPUs. Make it just a warning, to allow management software to be updated if necessary. In the future we may make it a fatal error instead. Command-line examples: * Correct, no warning: $ qemu-system-x86_64 -smp 2,maxcpus=4 $ qemu-system-x86_64 -smp 2,maxcpus=4 -numa node,cpus=0-3 * Incomplete, with warnings: $ qemu-system-x86_64 -smp 2,maxcpus=4 -numa node,cpus=0 qemu-system-x86_64: warning: CPU(s) not present in any NUMA nodes: 1 2 3 qemu-system-x86_64: warning: All CPU(s) up to maxcpus should be described in NUMA config $ qemu-system-x86_64 -smp 2,maxcpus=4 -numa node,cpus=0-2 qemu-system-x86_64: warning: CPU(s) not present in any NUMA nodes: 3 qemu-system-x86_64: warning: All CPU(s) up to maxcpus should be described in NUMA config Signed-off-by: Eduardo Habkost --- v1 -> v2: (no changes) v2 -> v3: * Use enumerate_cpus() and error_report() for error message * Simplify logic using bitmap_full() v3 -> v4: * Clarify error message, mention that all CPUs up to maxcpus need to be described in NUMA config v4 -> v5: * Commit log update, to make problem description clearer --- numa.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/numa.c b/numa.c index fe74e1eeaf..c975fb2682 100644 --- a/numa.c +++ b/numa.c @@ -200,6 +200,16 @@ static void validate_numa_cpus(void) bitmap_or(seen_cpus, seen_cpus, numa_info[i].node_cpu, MAX_CPUMASK_BITS); } + + if (!bitmap_full(seen_cpus, max_cpus)) { + char *msg; + bitmap_complement(seen_cpus, seen_cpus, max_cpus); + msg = enumerate_cpus(seen_cpus, max_cpus); + error_report("warning: CPU(s) not present in any NUMA nodes: %s", msg); + error_report("warning: All CPU(s) up to maxcpus should be described " + "in NUMA config"); + g_free(msg); + } } void parse_numa_opts(MachineClass *mc)