On Wed, Feb 12, 2025 at 05:48:13PM +0100, Andrea Righi wrote: > @@ -90,6 +131,78 @@ s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) > goto retry; > } > > +static s32 pick_idle_cpu_from_other_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) 'From other nodes' sounds a bit vague > +{ > + static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited); > + nodemask_t *unvisited = this_cpu_ptr(&per_cpu_unvisited); > + s32 cpu = -EBUSY; > + > + preempt_disable(); > + unvisited = this_cpu_ptr(&per_cpu_unvisited); > + > + /* > + * Restrict the search to the online nodes, excluding the current > + * one. > + */ > + nodes_clear(*unvisited); > + nodes_or(*unvisited, *unvisited, node_states[N_ONLINE]); nodes_clear() + nodes_or() == nodes_copy() Yeah, we're missing that. The attached patch adds nodes_copy(). Can you consider taking it for your series? > + node_clear(node, *unvisited); > + > + /* > + * Traverse all nodes in order of increasing distance, starting > + * from @node. > + * > + * This loop is O(N^2), with N being the number of NUMA nodes, > + * which might be quite expensive in large NUMA systems. However, > + * this complexity comes into play only when a scheduler enables > + * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU > + * without specifying a target NUMA node, so it shouldn't be a > + * bottleneck in most cases. > + * > + * As a future optimization we may want to cache the list of nodes > + * in a per-node array, instead of actually traversing them every > + * time. > + */ > + for_each_node_numadist(node, *unvisited) { > + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); > + if (cpu >= 0) > + break; > + } > + preempt_enable(); > + > + return cpu; > +} > + > +/* > + * Find an idle CPU in the system, starting from @node. 
> + */ > +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) > +{ > + s32 cpu; > + > + /* > + * Always search in the starting node first (this is an > + * optimization that can save some cycles even when the search is > + * not limited to a single node). > + */ > + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); > + if (cpu >= 0) > + return cpu; > + > + /* > + * Stop the search if we are using only a single global cpumask > + * (NUMA_NO_NODE) or if the search is restricted to the first node > + * only. > + */ > + if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE) > + return -EBUSY; > + > + /* > + * Extend the search to the other nodes. > + */ > + return pick_idle_cpu_from_other_nodes(cpus_allowed, node, flags); > +}