Many scx schedulers implement their own hard or soft-affinity rules to support topology characteristics, such as heterogeneous architectures (e.g., big.LITTLE, P-cores/E-cores), or to categorize tasks based on specific properties (e.g., running certain tasks only on a subset of CPUs). Currently, there is no mechanism to apply the built-in idle CPU selection policy to an arbitrary subset of CPUs. As a result, schedulers often implement their own idle CPU selection policies, which are typically similar to one another, leading to a lot of code duplication. To address this, modify scx_select_cpu_dfl() to accept an arbitrary cpumask that can be used by the BPF schedulers to apply the existing built-in idle CPU selection policy to a subset of allowed CPUs. With this concept, the idle CPU selection policy becomes the following: - always prioritize CPUs from fully idle SMT cores (if SMT is enabled), - select the same CPU if it's idle and in the allowed CPUs, - select an idle CPU within the same LLC, if the LLC cpumask is a subset of the allowed CPUs, - select an idle CPU within the same node, if the node cpumask is a subset of the allowed CPUs, - select an idle CPU within the allowed CPUs. This functionality will be exposed through a dedicated kfunc in a separate patch. Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx> --- kernel/sched/ext_idle.c | 96 ++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 26 deletions(-) diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index a90d85bce1ccb..a9755434e88b7 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -49,6 +49,7 @@ static struct scx_idle_cpus **scx_idle_node_masks; /* * Local per-CPU cpumasks (used to generate temporary idle cpumasks). 
*/ +static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask); static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask); static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask); @@ -397,15 +398,18 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); } -static inline bool task_allowed_all_cpus(const struct task_struct *p) +/* + * Return true if @p can run on all possible CPUs, false otherwise. + */ +static inline bool task_affinity_all(const struct task_struct *p) { return p->nr_cpus_allowed >= num_possible_cpus(); } /* * Return the subset of @cpus that task @p can use, according to - * @cpus_allowed, or NULL if none of the CPUs in the @cpus cpumask can be - * used. + * @cpus_allowed, or NULL if none of the CPUs in the target cpumask @cpus + * can be used. */ static const struct cpumask *task_cpumask(const struct task_struct *p, const struct cpumask *cpus_allowed, @@ -414,14 +418,20 @@ static const struct cpumask *task_cpumask(const struct task_struct *p, { /* * If the task is allowed to run on all CPUs, simply use the - * architecture's cpumask directly. Otherwise, compute the - * intersection of the architecture's cpumask and the task's - * allowed cpumask. + * target cpumask directly (@cpus). Otherwise, compute the + * intersection of the target cpumask and the task's allowed + * cpumask. */ - if (!cpus || task_allowed_all_cpus(p) || cpumask_subset(cpus, cpus_allowed)) + if (!cpus || ((cpus_allowed == p->cpus_ptr) && task_affinity_all(p)) || + cpumask_subset(cpus, cpus_allowed)) return cpus; - if (cpumask_and(local_cpus, cpus, cpus_allowed)) + /* + * Compute the intersection and return NULL if the result is empty + * or if it perfectly overlaps with the subset of allowed CPUs. 
+ */ + if (cpumask_and(local_cpus, cpus, cpus_allowed) && + !cpumask_equal(local_cpus, cpus_allowed)) return local_cpus; return NULL; @@ -439,13 +449,15 @@ static const struct cpumask *task_cpumask(const struct task_struct *p, * branch prediction optimizations. * * 3. Pick a CPU within the same LLC (Last-Level Cache): - * - if the above conditions aren't met, pick a CPU that shares the same LLC - * to maintain cache locality. + * - if the above conditions aren't met, pick a CPU that shares the same + * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain + * cache locality. * * 4. Pick a CPU within the same NUMA node, if enabled: - * - choose a CPU from the same NUMA node to reduce memory access latency. + * - choose a CPU from the same NUMA node, if the node cpumask is a + * subset of @cpus_allowed, to reduce memory access latency. * - * 5. Pick any idle CPU usable by the task. + * 5. Pick any idle CPU within the @cpus_allowed domain. * * Step 3 and 4 are performed only if the system has, respectively, * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and @@ -464,9 +476,43 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *cpus_allowed, u64 flags) { const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL; - int node = scx_cpu_node_if_enabled(prev_cpu); + const struct cpumask *allowed = p->cpus_ptr; + int node; s32 cpu; + preempt_disable(); + + /* + * Determine the subset of CPUs usable by @p within @cpus_allowed. 
+ */ + if (cpus_allowed != p->cpus_ptr) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask); + + if (task_affinity_all(p) || cpumask_subset(cpus_allowed, p->cpus_ptr)) { + allowed = cpus_allowed; + } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) { + allowed = local_cpus; + } else { + cpu = -EBUSY; + goto out_enable; + } + } + + /* + * If @prev_cpu is not in the allowed domain, try to assign a new + * arbitrary CPU usable by the task in the allowed domain. + */ + if (!cpumask_test_cpu(prev_cpu, allowed)) { + cpu = cpumask_any_and_distribute(p->cpus_ptr, allowed); + if (cpu < nr_cpu_ids) { + prev_cpu = cpu; + } else { + cpu = -EBUSY; + goto out_enable; + } + } + node = scx_cpu_node_if_enabled(prev_cpu); + /* * This is necessary to protect llc_cpus. */ @@ -476,19 +522,13 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * Determine the subset of CPUs that the task can use in its * current LLC and node. */ - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) { - numa_cpus = task_cpumask(p, cpus_allowed, numa_span(prev_cpu), + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) + numa_cpus = task_cpumask(p, allowed, numa_span(prev_cpu), this_cpu_cpumask_var_ptr(local_numa_idle_cpumask)); - if (cpumask_equal(numa_cpus, cpus_allowed)) - numa_cpus = NULL; - } - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { - llc_cpus = task_cpumask(p, cpus_allowed, llc_span(prev_cpu), + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) + llc_cpus = task_cpumask(p, allowed, llc_span(prev_cpu), this_cpu_cpumask_var_ptr(local_llc_idle_cpumask)); - if (cpumask_equal(llc_cpus, cpus_allowed)) - llc_cpus = NULL; - } /* * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. 
@@ -525,7 +565,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, cpu_rq(cpu)->scx.local_dsq.nr == 0 && (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && !cpumask_empty(idle_cpumask(waker_node)->cpu)) { - if (cpumask_test_cpu(cpu, cpus_allowed)) + if (cpumask_test_cpu(cpu, allowed)) goto out_unlock; } } @@ -570,7 +610,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * begin in prev_cpu's node and proceed to other nodes in * order of increasing distance. */ - cpu = scx_pick_idle_cpu(cpus_allowed, node, flags | SCX_PICK_IDLE_CORE); + cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE); if (cpu >= 0) goto out_unlock; @@ -618,12 +658,14 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * in prev_cpu's node and proceed to other nodes in order of * increasing distance. */ - cpu = scx_pick_idle_cpu(cpus_allowed, node, flags); + cpu = scx_pick_idle_cpu(allowed, node, flags); if (cpu >= 0) goto out_unlock; out_unlock: rcu_read_unlock(); +out_enable: + preempt_enable(); return cpu; } @@ -655,6 +697,8 @@ void scx_idle_init_masks(void) /* Allocate local per-cpu idle cpumasks */ for_each_possible_cpu(i) { + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i), GFP_KERNEL, cpu_to_node(i))); BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i), -- 2.48.1