On 01/12/20 02:59, Barry Song wrote: > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 1a68a05..ae8ec910 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -6106,6 +6106,37 @@ static inline int select_idle_smt(struct task_struct *p, int target) > > #endif /* CONFIG_SCHED_SMT */ > > +#ifdef CONFIG_SCHED_CLUSTER > +/* > + * Scan the local CLUSTER mask for idle CPUs. > + */ > +static int select_idle_cluster(struct task_struct *p, int target) > +{ > + int cpu; > + > + /* right now, no hardware with both cluster and smt to run */ > + if (sched_smt_active()) > + return -1; > + > + for_each_cpu_wrap(cpu, cpu_cluster_mask(target), target) { Gating this behind this new config, which is only leveraged by arm64, doesn't make it very generic. Note that powerpc also has this newish "CACHE" level which seems to overlap in function with your "CLUSTER" one (both are arch-specific, though). I think what you are after here is an SD_SHARE_PKG_RESOURCES domain walk, i.e. scan CPUs by increasing cache "distance". We already have it in some form, as we scan SMT & LLC domains; AFAICT LLC always maps to MC, except for said powerpc's CACHE thingie. *If* we are to generally support more levels with SD_SHARE_PKG_RESOURCES, we could, say, frob something into select_idle_cpu().
I'm thinking of something like the incomplete, untested diff below: --- diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae7ceba8fd4f..70692888db00 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6120,7 +6120,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - struct sched_domain *this_sd; + struct sched_domain *this_sd, *child = NULL; u64 avg_cost, avg_idle; u64 time; int this = smp_processor_id(); @@ -6150,14 +6150,22 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = cpu_clock(this); - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + do { + /* XXX: sd should start as SMT's parent */ + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + if (child) + cpumask_andnot(cpus, cpus, sched_domain_span(child)); + + for_each_cpu_wrap(cpu, cpus, target) { + if (!--nr) + return -1; + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + break; + } - for_each_cpu_wrap(cpu, cpus, target) { - if (!--nr) - return -1; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - break; - } + child = sd; + sd = sd->parent; + } while (sd && sd->flags & SD_SHARE_PKG_RESOURCES); time = cpu_clock(this) - time; update_avg(&this_sd->avg_scan_cost, time);