On Fri, Dec 20, 2024 at 04:11:41PM +0100, Andrea Righi wrote:
> With the introduction of separate per-NUMA node cpumasks, we
> automatically track idle CPUs within each NUMA node.
>
> This makes the special logic for determining idle CPUs in each NUMA node
> redundant and unnecessary, so we can get rid of it.

But it looks like you do more than that...

> Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
> ---
>  kernel/sched/ext_idle.c | 93 ++++++++++-------------------------------
>  1 file changed, 23 insertions(+), 70 deletions(-)
>
> diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
> index 013deaa08f12..b36e93da1b75 100644
> --- a/kernel/sched/ext_idle.c
> +++ b/kernel/sched/ext_idle.c
> @@ -82,7 +82,6 @@ static void idle_masks_init(void)
>  }
>
>  static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
> -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
>
>  /*
>   * Return the node id associated to a target idle CPU (used to determine
> @@ -259,25 +258,6 @@ static unsigned int numa_weight(s32 cpu)
>  	return sg->group_weight;
>  }
>
> -/*
> - * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
> - * domain is not defined).
> - */
> -static struct cpumask *numa_span(s32 cpu)
> -{
> -	struct sched_domain *sd;
> -	struct sched_group *sg;
> -
> -	sd = rcu_dereference(per_cpu(sd_numa, cpu));
> -	if (!sd)
> -		return NULL;
> -	sg = sd->groups;
> -	if (!sg)
> -		return NULL;
> -
> -	return sched_group_span(sg);

I didn't find llc_span() and node_span() in the vanilla kernel. Does this
series have prerequisites?

> -}
> -
>  /*
>   * Return true if the LLC domains do not perfectly overlap with the NUMA
>   * domains, false otherwise.
> @@ -329,7 +309,7 @@ static bool llc_numa_mismatch(void)
>   */
>  static void update_selcpu_topology(struct sched_ext_ops *ops)
>  {
> -	bool enable_llc = false, enable_numa = false;
> +	bool enable_llc = false;
>  	unsigned int nr_cpus;
>  	s32 cpu = cpumask_first(cpu_online_mask);
>
> @@ -348,41 +328,34 @@ static void update_selcpu_topology(struct sched_ext_ops *ops)
>  	if (nr_cpus > 0) {
>  		if (nr_cpus < num_online_cpus())
>  			enable_llc = true;
> +		/*
> +		 * No need to enable LLC optimization if the LLC domains are
> +		 * perfectly overlapping with the NUMA domains when per-node
> +		 * cpumasks are enabled.
> +		 */
> +		if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
> +		    !llc_numa_mismatch())
> +			enable_llc = false;

This doesn't sound like redundancy removal. I may be wrong, but this looks
like some sort of optimization. If so, it deserves to be a separate patch.

>  		pr_debug("sched_ext: LLC=%*pb weight=%u\n",
>  			 cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
>  	}
> -
> -	/*
> -	 * Enable NUMA optimization only when there are multiple NUMA domains
> -	 * among the online CPUs and the NUMA domains don't perfectly overlaps
> -	 * with the LLC domains.
> -	 *
> -	 * If all CPUs belong to the same NUMA node and the same LLC domain,
> -	 * enabling both NUMA and LLC optimizations is unnecessary, as checking
> -	 * for an idle CPU in the same domain twice is redundant.
> -	 */
> -	nr_cpus = numa_weight(cpu);

I didn't find numa_weight() either...

> -	if (nr_cpus > 0) {
> -		if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
> -			enable_numa = true;
> -		pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
> -			 cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
> -	}

This calls numa_weight() twice. Get rid of it for good (see the P.S. below).

>  	rcu_read_unlock();
>
>  	pr_debug("sched_ext: LLC idle selection %s\n",
>  		 enable_llc ? "enabled" : "disabled");
> -	pr_debug("sched_ext: NUMA idle selection %s\n",
> -		 enable_numa ? "enabled" : "disabled");
>
>  	if (enable_llc)
>  		static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
>  	else
>  		static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
> -	if (enable_numa)
> -		static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
> +
> +	/*
> +	 * Check if we need to enable per-node cpumasks.
> +	 */
> +	if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
> +		static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);

This is the key to the whole series!

>  	else
> -		static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
> +		static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
>  }

This knob enables the whole new machinery, and it definitely deserves to be
a separate, very last patch. As it stands, it looks like a sneaky replacement
of scx_selcpu_topo_numa with scx_builtin_idle_per_node, and this is wrong.

Are you sure you need a comment on top of it? To me, the code is quite
self-explanatory...

>
>  /*
> @@ -405,9 +378,8 @@ static void update_selcpu_topology(struct sched_ext_ops *ops)
>   *
>   * 5. Pick any idle CPU usable by the task.
>   *
> - * Step 3 and 4 are performed only if the system has, respectively, multiple
> - * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
> - * scx_selcpu_topo_numa).
> + * Step 3 is performed only if the system has multiple LLC domains that are not
> + * perfectly overlapping with the NUMA domains (see scx_selcpu_topo_llc).
>   *
>   * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
>   * we never call ops.select_cpu() for them, see select_task_rq().
> @@ -416,7 +388,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
>  			      u64 wake_flags, bool *found)
>  {
>  	const struct cpumask *llc_cpus = NULL;
> -	const struct cpumask *numa_cpus = NULL;
>  	int node = idle_cpu_to_node(prev_cpu);
>  	s32 cpu;
>
> @@ -438,13 +409,9 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
>  	 * CPU affinity), the task will simply use the flat scheduling domain
>  	 * defined by user-space.
>  	 */
> -	if (p->nr_cpus_allowed >= num_possible_cpus()) {
> -		if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
> -			numa_cpus = numa_span(prev_cpu);
> -
> +	if (p->nr_cpus_allowed >= num_possible_cpus())
>  		if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
>  			llc_cpus = llc_span(prev_cpu);
> -	}

I'd keep the curly braces. That would minimize your patch and preserve more
history.
>
>  	/*
>  	 * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
> @@ -507,15 +474,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
>  			goto cpu_found;
>  	}
>
> -	/*
> -	 * Search for any fully idle core in the same NUMA node.
> -	 */
> -	if (numa_cpus) {
> -		cpu = scx_pick_idle_cpu(numa_cpus, node, SCX_PICK_IDLE_CORE);
> -		if (cpu >= 0)
> -			goto cpu_found;
> -	}
> -
>  	/*
>  	 * Search for any full idle core usable by the task.
>  	 *
> @@ -545,17 +503,12 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
>  			goto cpu_found;
>  	}
>
> -	/*
> -	 * Search for any idle CPU in the same NUMA node.
> -	 */
> -	if (numa_cpus) {
> -		cpu = pick_idle_cpu_from_node(numa_cpus, node, 0);
> -		if (cpu >= 0)
> -			goto cpu_found;
> -	}
> -
>  	/*
>  	 * Search for any idle CPU usable by the task.
> +	 *
> +	 * If NUMA aware idle selection is enabled, the search will begin
> +	 * in prev_cpu's node and proceed to other nodes in order of
> +	 * increasing distance.

Again, this is definitely not a redundancy removal. This is a description of
new behavior, and should go with the implementation of that behavior.

>  	 */
>  	cpu = scx_pick_idle_cpu(p->cpus_ptr, node, 0);
>  	if (cpu >= 0)
> --
> 2.47.1
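P.S. Regarding the numa_weight() double call above: if some intermediate
patch in the series ends up keeping that hunk, the second call could simply
reuse nr_cpus, which already holds numa_weight(cpu). Untested, just to
illustrate:

		pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
			 cpumask_pr_args(numa_span(cpu)), nr_cpus);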