On Fri, Dec 20, 2024 at 04:11:41PM +0100, Andrea Righi wrote:
> With the introduction of separate per-NUMA node cpumasks, we
> automatically track idle CPUs within each NUMA node.
>
> This makes the special logic for determining idle CPUs in each NUMA node
> redundant and unnecessary, so we can get rid of it.

But it looks like you do more than that...

> Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
> ---
>  kernel/sched/ext_idle.c | 93 ++++++++++-------------------------------
>  1 file changed, 23 insertions(+), 70 deletions(-)
>
> diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
> index 013deaa08f12..b36e93da1b75 100644
> --- a/kernel/sched/ext_idle.c
> +++ b/kernel/sched/ext_idle.c
> @@ -82,7 +82,6 @@ static void idle_masks_init(void)
>  }
>
>  static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
> -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
>
>  /*
>   * Return the node id associated to a target idle CPU (used to determine
> @@ -259,25 +258,6 @@ static unsigned int numa_weight(s32 cpu)
>  	return sg->group_weight;
>  }
>
> -/*
> - * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
> - * domain is not defined).
> - */
> -static struct cpumask *numa_span(s32 cpu)
> -{
> -	struct sched_domain *sd;
> -	struct sched_group *sg;
> -
> -	sd = rcu_dereference(per_cpu(sd_numa, cpu));
> -	if (!sd)
> -		return NULL;
> -	sg = sd->groups;
> -	if (!sg)
> -		return NULL;
> -
> -	return sched_group_span(sg);

I didn't find llc_span() and node_span() in the vanilla kernel. Does this
series have prerequisites?

> -}
> -
>  /*
>   * Return true if the LLC domains do not perfectly overlap with the NUMA
>   * domains, false otherwise.
> @@ -329,7 +309,7 @@ static bool llc_numa_mismatch(void)
>   */
>  static void update_selcpu_topology(struct sched_ext_ops *ops)
>  {
> -	bool enable_llc = false, enable_numa = false;
> +	bool enable_llc = false;
>  	unsigned int nr_cpus;
>  	s32 cpu = cpumask_first(cpu_online_mask);
>
> @@ -348,41 +328,34 @@ static void update_selcpu_topology(struct sched_ext_ops *ops)
>  	if (nr_cpus > 0) {
>  		if (nr_cpus < num_online_cpus())
>  			enable_llc = true;
> +		/*
> +		 * No need to enable LLC optimization if the LLC domains are
> +		 * perfectly overlapping with the NUMA domains when per-node
> +		 * cpumasks are enabled.
> +		 */
> +		if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
> +		    !llc_numa_mismatch())
> +			enable_llc = false;

This doesn't sound like redundancy removal. I may be wrong, but this looks
like some sort of optimization. If so, it deserves to be a separate patch.

>  		pr_debug("sched_ext: LLC=%*pb weight=%u\n",
>  			 cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
>  	}
> -
> -	/*
> -	 * Enable NUMA optimization only when there are multiple NUMA domains
> -	 * among the online CPUs and the NUMA domains don't perfectly overlaps
> -	 * with the LLC domains.
> -	 *
> -	 * If all CPUs belong to the same NUMA node and the same LLC domain,
> -	 * enabling both NUMA and LLC optimizations is unnecessary, as checking
> -	 * for an idle CPU in the same domain twice is redundant.
> -	 */
> -	nr_cpus = numa_weight(cpu);

I didn't find numa_weight() either...

> -	if (nr_cpus > 0) {
> -		if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
> -			enable_numa = true;
> -		pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
> -			 cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
> -	}

This calls numa_weight() twice. Get rid of it for good (see the P.S. below).

>  	rcu_read_unlock();
>
>  	pr_debug("sched_ext: LLC idle selection %s\n",
>  		 enable_llc ? "enabled" : "disabled");
> -	pr_debug("sched_ext: NUMA idle selection %s\n",
> -		 enable_numa ? "enabled" : "disabled");
>
>  	if (enable_llc)
>  		static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
>  	else
>  		static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
> -	if (enable_numa)
> -		static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
> +
> +	/*
> +	 * Check if we need to enable per-node cpumasks.
> +	 */
> +	if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
> +		static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);

This is the key to the whole series!

>  	else
> -		static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
> +		static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
>  }

This knob enables the whole new machinery, and it definitely deserves to be
a separate, very last patch. As it stands, it looks like a sneaky replacement
of scx_selcpu_topo_numa with scx_builtin_idle_per_node, and this is wrong.

Are you sure you need a comment on top of it? To me, the code is quite
self-explanatory...

>
>  /*
> @@ -405,9 +378,8 @@ static void update_selcpu_topology(struct sched_ext_ops *ops)
>   *
>   * 5. Pick any idle CPU usable by the task.
>   *
> - * Step 3 and 4 are performed only if the system has, respectively, multiple
> - * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
> - * scx_selcpu_topo_numa).
> + * Step 3 is performed only if the system has multiple LLC domains that are not
> + * perfectly overlapping with the NUMA domains (see scx_selcpu_topo_llc).
>   *
>   * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
>   * we never call ops.select_cpu() for them, see select_task_rq().
> @@ -416,7 +388,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
>  			      u64 wake_flags, bool *found)
>  {
>  	const struct cpumask *llc_cpus = NULL;
> -	const struct cpumask *numa_cpus = NULL;
>  	int node = idle_cpu_to_node(prev_cpu);
>  	s32 cpu;
>
> @@ -438,13 +409,9 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
>  	 * CPU affinity), the task will simply use the flat scheduling domain
>  	 * defined by user-space.
>  	 */
> -	if (p->nr_cpus_allowed >= num_possible_cpus()) {
> -		if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
> -			numa_cpus = numa_span(prev_cpu);
> -
> +	if (p->nr_cpus_allowed >= num_possible_cpus())
>  		if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
>  			llc_cpus = llc_span(prev_cpu);
> -	}

I'd keep the curly braces. That would minimize your patch and preserve more
history.
>
>  	/*
>  	 * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
> @@ -507,15 +474,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
>  			goto cpu_found;
>  	}
>
> -	/*
> -	 * Search for any fully idle core in the same NUMA node.
> -	 */
> -	if (numa_cpus) {
> -		cpu = scx_pick_idle_cpu(numa_cpus, node, SCX_PICK_IDLE_CORE);
> -		if (cpu >= 0)
> -			goto cpu_found;
> -	}
> -
>  	/*
>  	 * Search for any full idle core usable by the task.
>  	 *
> @@ -545,17 +503,12 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
>  			goto cpu_found;
>  	}
>
> -	/*
> -	 * Search for any idle CPU in the same NUMA node.
> -	 */
> -	if (numa_cpus) {
> -		cpu = pick_idle_cpu_from_node(numa_cpus, node, 0);
> -		if (cpu >= 0)
> -			goto cpu_found;
> -	}
> -
>  	/*
>  	 * Search for any idle CPU usable by the task.
> +	 *
> +	 * If NUMA aware idle selection is enabled, the search will begin
> +	 * in prev_cpu's node and proceed to other nodes in order of
> +	 * increasing distance.

Again, this is definitely not a redundancy removal. This is a description of
new behavior, and should go with the implementation of that behavior.

>  	 */
>  	cpu = scx_pick_idle_cpu(p->cpus_ptr, node, 0);
>  	if (cpu >= 0)
> --
> 2.47.1
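P.S. Regarding the numa_weight() double call above: if some intermediate
patch in the series ends up keeping that hunk, the second call could simply
reuse nr_cpus, which already holds numa_weight(cpu). Untested, just to
illustrate:

		pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
			 cpumask_pr_args(numa_span(cpu)), nr_cpus);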