On 19/04/23 22:19, Yury Norov wrote: > +/* > + * sched_numa_find_next_cpu() - given the NUMA topology, find the next cpu > + * cpumask: cpumask to find a cpu from > + * cpu: current cpu > + * node: local node > + * hop: (in/out) indicates distance order of current CPU to a local node > + * > + * The function searches for next cpu at a given NUMA distance, indicated > + * by hop, and if nothing found, tries to find CPUs at a greater distance, > + * starting from the beginning. > + * > + * Return: cpu, or >= nr_cpu_ids when nothing found. > + */ > +int sched_numa_find_next_cpu(const struct cpumask *cpus, int cpu, int node, unsigned int *hop) > +{ > + unsigned long *cur, *prev; > + struct cpumask ***masks; > + unsigned int ret; > + > + if (*hop >= sched_domains_numa_levels) > + return nr_cpu_ids; > + > + masks = rcu_dereference(sched_domains_numa_masks); > + cur = cpumask_bits(masks[*hop][node]); > + if (*hop == 0) > + ret = find_next_and_bit(cpumask_bits(cpus), cur, nr_cpu_ids, cpu); > + else { > + prev = cpumask_bits(masks[*hop - 1][node]); > + ret = find_next_and_andnot_bit(cpumask_bits(cpus), cur, prev, nr_cpu_ids, cpu); > + } > + > + if (ret < nr_cpu_ids) > + return ret; > + > + *hop += 1; > + return sched_numa_find_next_cpu(cpus, 0, node, hop); sched_domains_numa_levels is a fairly small number, so the recursion depth isn't something we really need to worry about - still, the iterative variant of this is fairly straightforward to get to: diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e850f16c003ae..4c9a9e48fef6d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2151,23 +2151,27 @@ int sched_numa_find_next_cpu(const struct cpumask *cpus, int cpu, int node, unsi struct cpumask ***masks; unsigned int ret; - if (*hop >= sched_domains_numa_levels) - return nr_cpu_ids; + /* + * Reset @cpu to 0 when increasing @hop, since CPU numbering has no + * relationship with NUMA distance: a search at @hop+1 may yield CPUs + * of lower 
ID than previously seen! + */ + for (; *hop < sched_domains_numa_levels; *hop += 1, cpu = 0) { + masks = rcu_dereference(sched_domains_numa_masks); + cur = cpumask_bits(masks[*hop][node]); + + if (*hop == 0) { + ret = find_next_and_bit(cpumask_bits(cpus), cur, nr_cpu_ids, cpu); + } else { + prev = cpumask_bits(masks[*hop - 1][node]); + ret = find_next_and_andnot_bit(cpumask_bits(cpus), cur, prev, nr_cpu_ids, cpu); + } - masks = rcu_dereference(sched_domains_numa_masks); - cur = cpumask_bits(masks[*hop][node]); - if (*hop == 0) - ret = find_next_and_bit(cpumask_bits(cpus), cur, nr_cpu_ids, cpu); - else { - prev = cpumask_bits(masks[*hop - 1][node]); - ret = find_next_and_andnot_bit(cpumask_bits(cpus), cur, prev, nr_cpu_ids, cpu); + if (ret < nr_cpu_ids) + return ret; } - if (ret < nr_cpu_ids) - return ret; - - *hop += 1; - return sched_numa_find_next_cpu(cpus, 0, node, hop); + return nr_cpu_ids; } EXPORT_SYMBOL_GPL(sched_numa_find_next_cpu);