On Thu, Nov 01, 2018 at 10:01:05AM +0000, John Garry wrote: > On 31/10/2018 20:46, Peter Zijlstra wrote: > > > I also note that if I apply the patch, below, to reject the invalid NUMA > > > distance, we're still getting a warning/error: > > > > > > [ 7.144407] CPU: All CPU(s) started at EL2 > > > [ 7.148678] alternatives: patching kernel code > > > [ 7.153557] ERROR: Node-0 not representative > > > [ 7.153557] > > > [ 7.159365] 10 15 20 25 > > > [ 7.162097] 15 10 25 30 > > > [ 7.164832] 20 25 10 15 > > > [ 7.167562] 25 30 15 10 > > > > Yeah, that's an 'obviously' broken topology too. > > > > AFAICT, this conforms to ACPI spec SLIT rules, and the kernel SLIT > validation allows this also. So maybe we should shout louder here or even > mark the SLIT as invalid if totally broken. Right. Part of the problem is that I only have a few necessary conditions (symmetry, trace-identity, node0-complete-distance) for the distance table, but together they are not sufficient to determine if a topology is 'good'. (also, strictly speaking, non-symmetric topologies are possible -- think uni-directional mesh links -- we've just not ever seen and validated those) That said; I can certainly make this code give up and warn harder on those conditions. How is something like the below? --- diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9d74371e4aad..41e703dd875f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1207,6 +1218,11 @@ sd_init(struct sched_domain_topology_level *tl, return sd; } +static const struct cpumask *cpu_system_mask(int cpu) +{ + return cpu_online_mask; +} + /* * Topology list, bottom-up. */ @@ -1337,7 +1353,7 @@ void sched_init_numa(void) int level = 0; int i, j, k; - sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); if (!sched_domains_numa_distance) return; @@ -1353,39 +1369,22 @@ void sched_init_numa(void) * node_distance(i,j) in order to avoid cubic time. */ next_distance = curr_distance; + for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); - - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); - } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; - } + int distance = node_distance(0, j); - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; } /* @@ -1428,7 +1427,24 @@ void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; for_each_node(k) { - if (node_distance(j, k) > sched_domains_numa_distance[i]) + int distance = node_distance(j, k); + + if (!i && j != k) { + sched_numa_warn("Non-trace locality\n"); + goto fail; + } + + if (distance > curr_distance) { + sched_numa_warn("Node-0 not complete\n"); + goto fail; + } + + if (distance != node_distance(k, j)) { + sched_numa_warn("Non symmetric\n"); + goto fail; + } + + if (distance > sched_domains_numa_distance[i]) continue; cpumask_or(mask, mask, cpumask_of_node(k)); @@ -1437,9 +1453,10 @@ void sched_init_numa(void) } /* Compute default topology size */ - for (i = 0; sched_domain_topology[i].mask; i++); + for (k = 0; sched_domain_topology[k].mask; k++) + ; - tl = kzalloc((i + level + 1) * + tl = kzalloc((k + level + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -1447,7 +1464,7 @@ void sched_init_numa(void) /* * Copy the default topology bits.. */ - for (i = 0; sched_domain_topology[i].mask; i++) + for (i = 0; i < k; i++) tl[i] = sched_domain_topology[i]; /* @@ -1478,6 +1495,31 @@ void sched_init_numa(void) sched_max_numa_distance = sched_domains_numa_distance[level - 1]; init_numa_topology_type(); + + return; + +fail: + /* Compute default topology size */ + for (k = 0; sched_domain_topology[k].mask; k++) + ; + + tl = kzalloc((k + 2) * + sizeof(struct sched_domain_topology_level), GFP_KERNEL); + if (!tl) + return; + + /* + * Copy the default topology bits.. + */ + for (i = 0; i < k; i++) + tl[i] = sched_domain_topology[i]; + + tl[i] = (struct sched_domain_topology_level){ + .mask = cpu_system_mask, + SD_INIT_NAME(SYSTEM) + }; + + sched_domain_topology = tl; } void sched_domains_numa_masks_set(unsigned int cpu)