> -----Original Message-----
> From: Peter Zijlstra [mailto:peterz@xxxxxxxxxxxxx]
> Sent: Tuesday, March 2, 2021 11:43 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@xxxxxxxxxxxxx>
> Cc: tim.c.chen@xxxxxxxxxxxxxxx; catalin.marinas@xxxxxxx; will@xxxxxxxxxx;
> rjw@xxxxxxxxxxxxx; vincent.guittot@xxxxxxxxxx; bp@xxxxxxxxx;
> tglx@xxxxxxxxxxxxx; mingo@xxxxxxxxxx; lenb@xxxxxxxxxx;
> dietmar.eggemann@xxxxxxx; rostedt@xxxxxxxxxxx; bsegall@xxxxxxxxxx;
> mgorman@xxxxxxx; msys.mizuma@xxxxxxxxx; valentin.schneider@xxxxxxx;
> gregkh@xxxxxxxxxxxxxxxxxxx; Jonathan Cameron <jonathan.cameron@xxxxxxxxxx>;
> juri.lelli@xxxxxxxxxx; mark.rutland@xxxxxxx; sudeep.holla@xxxxxxx;
> aubrey.li@xxxxxxxxxxxxxxx; linux-arm-kernel@xxxxxxxxxxxxxxxxxxx;
> linux-kernel@xxxxxxxxxxxxxxx; linux-acpi@xxxxxxxxxxxxxxx; x86@xxxxxxxxxx;
> xuwei (O) <xuwei5@xxxxxxxxxx>; Zengtao (B) <prime.zeng@xxxxxxxxxxxxx>;
> guodong.xu@xxxxxxxxxx; yangyicong <yangyicong@xxxxxxxxxx>; Liguozhu (Kenneth)
> <liguozhu@xxxxxxxxxxxxx>; linuxarm@xxxxxxxxxxxxx; hpa@xxxxxxxxx
> Subject: Re: [RFC PATCH v4 2/3] scheduler: add scheduler level for clusters
>
> On Tue, Mar 02, 2021 at 11:59:39AM +1300, Barry Song wrote:
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 88a2e2b..d805e59 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -7797,6 +7797,16 @@ int sched_cpu_activate(unsigned int cpu)
> >  	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
> >  		static_branch_inc_cpuslocked(&sched_smt_present);
> >  #endif
> > +
> > +#ifdef CONFIG_SCHED_CLUSTER
> > +	/*
> > +	 * When going up, increment the number of cluster cpus with
> > +	 * cluster present.
> > +	 */
> > +	if (cpumask_weight(cpu_cluster_mask(cpu)) > 1)
> > +		static_branch_inc_cpuslocked(&sched_cluster_present);
> > +#endif
> > +
> >  	set_cpu_active(cpu, true);
> >
> >  	if (sched_smp_initialized) {
> > @@ -7873,6 +7883,14 @@ int sched_cpu_deactivate(unsigned int cpu)
> >  		static_branch_dec_cpuslocked(&sched_smt_present);
> >  #endif
> >
> > +#ifdef CONFIG_SCHED_CLUSTER
> > +	/*
> > +	 * When going down, decrement the number of cpus with cluster present.
> > +	 */
> > +	if (cpumask_weight(cpu_cluster_mask(cpu)) > 1)
> > +		static_branch_dec_cpuslocked(&sched_cluster_present);
> > +#endif
> > +
> >  	if (!sched_smp_initialized)
> >  		return 0;
>
> I don't think that's correct. IIUC this will mean the
> sched_cluster_present thing will be enabled on anything with SMT (very
> much including x86 big cores after the next patch).
>
> I'm thinking that at the very least you should check a CLS domain
> exists, but that might be hard at this point, because the sched domains
> haven't been built yet.

We might be able to achieve the same goal with something like:

	int cls_wt = cpumask_weight(cpu_cluster_mask(cpu));

	if ((cls_wt > cpumask_weight(cpu_smt_mask(cpu))) &&
	    (cls_wt < cpumask_weight(cpu_coregroup_mask(cpu))))
		sched_cluster_present...

(A fuller, untested sketch of how this could sit in sched_cpu_activate()
is further down.)

>
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 8a8bd7b..3db7b07 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -6009,6 +6009,11 @@ static inline int __select_idle_cpu(int cpu)
> >  	return -1;
> >  }
> >
> > +#ifdef CONFIG_SCHED_CLUSTER
> > +DEFINE_STATIC_KEY_FALSE(sched_cluster_present);
> > +EXPORT_SYMBOL_GPL(sched_cluster_present);
>
> I really rather think this shouldn't be exported.

Ok. Makes sense.

>
> > +#endif
> > +
> >  #ifdef CONFIG_SCHED_SMT
> >  DEFINE_STATIC_KEY_FALSE(sched_smt_present);
> >  EXPORT_SYMBOL_GPL(sched_smt_present);
>
> This is a KVM wart, it needs to know because mitigation crap.
>

Ok.
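Coming back to the cluster-presence check above: expanded, it could look
roughly like the below in sched_cpu_activate(), with sched_cpu_deactivate()
mirroring it via static_branch_dec_cpuslocked(). Untested, only to illustrate
the idea:

#ifdef CONFIG_SCHED_CLUSTER
	/*
	 * Only treat the cluster as present when its span is strictly
	 * larger than the SMT span and strictly smaller than the LLC
	 * span, i.e. there really is a CLS level between SMT and MC.
	 */
	if (cpumask_weight(cpu_cluster_mask(cpu)) >
	    cpumask_weight(cpu_smt_mask(cpu)) &&
	    cpumask_weight(cpu_cluster_mask(cpu)) <
	    cpumask_weight(cpu_coregroup_mask(cpu)))
		static_branch_inc_cpuslocked(&sched_cluster_present);
#endif

That should keep sched_cluster_present disabled on SMT-only parts where the
cluster mask just mirrors the SMT mask, which I think addresses the concern
above.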
> > @@ -6116,6 +6121,26 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
> >
> >  #endif /* CONFIG_SCHED_SMT */
> >
> > +static inline int _select_idle_cpu(bool smt, struct task_struct *p, int target, struct cpumask *cpus, int *idle_cpu, int *nr)
> > +{
> > +	int cpu, i;
> > +
> > +	for_each_cpu_wrap(cpu, cpus, target) {
> > +		if (smt) {
> > +			i = select_idle_core(p, cpu, cpus, idle_cpu);
> > +		} else {
> > +			if (!--*nr)
> > +				return -1;
> > +			i = __select_idle_cpu(cpu);
> > +		}
> > +
> > +		if ((unsigned int)i < nr_cpumask_bits)
> > +			return i;
> > +	}
> > +
> > +	return -1;
> > +}
> > +
> >  /*
> >   * Scan the LLC domain for idle CPUs; this is dynamically regulated by
> >   * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
> > @@ -6124,7 +6149,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
> >  static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
> >  {
> >  	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> > -	int i, cpu, idle_cpu = -1, nr = INT_MAX;
> > +	int i, idle_cpu = -1, nr = INT_MAX;
> >  	bool smt = test_idle_cores(target, false);
> >  	int this = smp_processor_id();
> >  	struct sched_domain *this_sd;
> > @@ -6134,7 +6159,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
> >  	if (!this_sd)
> >  		return -1;
> >
> > -	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> > +	if (!sched_cluster_active())
> > +		cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> > +#ifdef CONFIG_SCHED_CLUSTER
> > +	if (sched_cluster_active())
> > +		cpumask_and(cpus, cpu_cluster_mask(target), p->cpus_ptr);
> > +#endif
> >
> >  	if (sched_feat(SIS_PROP) && !smt) {
> >  		u64 avg_cost, avg_idle, span_avg;
> > @@ -6155,24 +6185,32 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
> >  		time = cpu_clock(this);
> >  	}
> >
> > -	for_each_cpu_wrap(cpu, cpus, target) {
> > -		if (smt) {
> > -			i = select_idle_core(p, cpu, cpus, &idle_cpu);
> > -			if ((unsigned int)i < nr_cpumask_bits)
> > -				return i;
> > +	/* scan cluster before scanning the whole llc */
> > +#ifdef CONFIG_SCHED_CLUSTER
> > +	if (sched_cluster_active()) {
> > +		i = _select_idle_cpu(smt, p, target, cpus, &idle_cpu, &nr);
> > +		if ((unsigned int) i < nr_cpumask_bits) {
> > +			idle_cpu = i;
> > +			goto done;
> > +		} else if (nr <= 0)
> > +			return -1;
> >
> > -		} else {
> > -			if (!--nr)
> > -				return -1;
> > -			idle_cpu = __select_idle_cpu(cpu);
> > -			if ((unsigned int)idle_cpu < nr_cpumask_bits)
> > -				break;
> > -		}
> > +		cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> > +		cpumask_andnot(cpus, cpus, cpu_cluster_mask(target));
> >  	}
> > +#endif
> > +
> > +	i = _select_idle_cpu(smt, p, target, cpus, &idle_cpu, &nr);
> > +	if ((unsigned int) i < nr_cpumask_bits) {
> > +		idle_cpu = i;
> > +		goto done;
> > +	} else if (nr <= 0)
> > +		return -1;
> >
> >  	if (smt)
> >  		set_idle_cores(this, false);
> >
> > +done:
> >  	if (sched_feat(SIS_PROP) && !smt) {
> >  		time = cpu_clock(this) - time;
> >  		update_avg(&this_sd->avg_scan_cost, time);
>
> And this is just horrific :-(

I was actually quite struggling with this part. I had tried a couple of
ways before sending this one, and the version I sent is still quite ugly.

Thanks
Barry
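PS: for reference, one rough and completely untested idea for flattening the
flow would be to hide the cluster span behind a small helper (the helper name
below is made up), so that select_idle_cpu() keeps a single shape and the
#ifdef stays out of the function body:

#ifdef CONFIG_SCHED_CLUSTER
static inline const struct cpumask *sched_cluster_span(int cpu)
{
	return sched_cluster_active() ? cpu_cluster_mask(cpu) : NULL;
}
#else
static inline const struct cpumask *sched_cluster_span(int cpu)
{
	return NULL;
}
#endif

The scan part of select_idle_cpu() could then become roughly:

	const struct cpumask *cluster = sched_cluster_span(target);

	if (cluster)
		cpumask_and(cpus, cluster, p->cpus_ptr);
	else
		cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

	/* SIS_PROP cost setup unchanged ... */

	/* scan the cluster (or, without clusters, the whole LLC) first */
	i = _select_idle_cpu(smt, p, target, cpus, &idle_cpu, &nr);
	if ((unsigned int)i < nr_cpumask_bits) {
		idle_cpu = i;
		goto done;
	}
	if (nr <= 0)
		return -1;

	/* then fall back to the rest of the LLC */
	if (cluster) {
		cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
		cpumask_andnot(cpus, cpus, cluster);

		i = _select_idle_cpu(smt, p, target, cpus, &idle_cpu, &nr);
		if ((unsigned int)i < nr_cpumask_bits) {
			idle_cpu = i;
			goto done;
		}
		if (nr <= 0)
			return -1;
	}

It is not any shorter, but the cluster/no-cluster decision is made in exactly
one place. Anyway, just thinking out loud.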