Ping - any feedback? Thanks!

On Wed, Mar 10, 2021 at 11:02:33AM -0800, Alison Schofield wrote:
> Commit 1340ccfa9a9a ("x86,sched: Allow topologies where NUMA nodes
> share an LLC") added a vendor and model specific check to never
> call topology_sane() for Intel Skylake Server systems where NUMA
> nodes share an LLC.
> 
> Intel Ice Lake and Sapphire Rapids CPUs also enumerate an LLC that is
> shared by multiple NUMA nodes. The LLC on these CPUs is shared for
> off-package data access but private to the NUMA node for on-package
> access. Rather than managing a list of allowable SNC topologies, make
> this SNC topology the default, and treat Intel's Cluster-On-Die (COD)
> topology as the exception.
> 
> In SNC mode, Skylake, Ice Lake, and Sapphire Rapids servers do not
> emit this warning:
> 
> sched: CPU #3's llc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency.
> 
> Acked-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Suggested-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> Signed-off-by: Alison Schofield <alison.schofield@xxxxxxxxx>
> Cc: stable@xxxxxxxxxxxxxxx
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Cc: Tony Luck <tony.luck@xxxxxxxxx>
> Cc: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
> Cc: "H. Peter Anvin" <hpa@xxxxxxxxxxxxxxx>
> Cc: Borislav Petkov <bp@xxxxxxxxx>
> Cc: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> Cc: David Rientjes <rientjes@xxxxxxxxxx>
> Cc: Igor Mammedov <imammedo@xxxxxxxxxx>
> Cc: Prarit Bhargava <prarit@xxxxxxxxxx>
> Cc: brice.goglin@xxxxxxxxx
> ---
> 
> Changes v2->v3:
> - This is a v3 of this patch: https://lore.kernel.org/lkml/20210216195804.24204-1-alison.schofield@xxxxxxxxx/
> - Implemented PeterZ's suggestion, and his code, to check for COD instead
>   of SNC.
> - Updated commit message and log.
> - Added Cc: stable.
> 
> Changes v1->v2:
> - Implemented the minimal required change of adding the new models to
>   the existing vendor and model specific check.
> 
> - Side effect of going minimalist: no longer labelled an X86_BUG (TonyL)
> 
> - Considered PeterZ's suggestion of checking for COD CPUs, rather than
>   SNC CPUs. That meant this snc_cpu list would go away, and so it never
>   needs updating. That ups the stakes for this patch wrt LOC changed
>   and testing needed. It actually drove me back to this simplest solution.
> 
> - Considered DaveH's suggestion to remove the check altogether and recognize
>   these topologies as sane. Not pursuing that further here. This patch
>   is what is needed now. The broader discussion of sane topologies can
>   carry on independent of this update.
> 
> - Updated commit message and log.
> 
> 
>  arch/x86/kernel/smpboot.c | 90 ++++++++++++++++++++-------------------
>  1 file changed, 46 insertions(+), 44 deletions(-)
> 
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 02813a7f3a7c..147b2f3a2a09 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -458,29 +458,52 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
>  	return false;
>  }
>  
> +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
> +{
> +	if (c->phys_proc_id == o->phys_proc_id &&
> +	    c->cpu_die_id == o->cpu_die_id)
> +		return true;
> +	return false;
> +}
> +
> +/*
> + * Unlike the other levels, we do not enforce keeping a
> + * multicore group inside a NUMA node. If this happens, we will
> + * discard the MC level of the topology later.
> + */
> +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
> +{
> +	if (c->phys_proc_id == o->phys_proc_id)
> +		return true;
> +	return false;
> +}
> +
>  /*
> - * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
> + * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
>   *
> - * These are Intel CPUs that enumerate an LLC that is shared by
> - * multiple NUMA nodes. The LLC on these systems is shared for
> - * off-package data access but private to the NUMA node (half
> - * of the package) for on-package access.
> + * Any Intel CPU that has multiple nodes per package and does not
> + * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
>   *
> - * CPUID (the source of the information about the LLC) can only
> - * enumerate the cache as being shared *or* unshared, but not
> - * this particular configuration. The CPU in this case enumerates
> - * the cache to be shared across the entire package (spanning both
> - * NUMA nodes).
> + * When in SNC mode, these CPUs enumerate an LLC that is shared
> + * by multiple NUMA nodes. The LLC is shared for off-package data
> + * access but private to the NUMA node (half of the package) for
> + * on-package access. CPUID (the source of the information about
> + * the LLC) can only enumerate the cache as shared or unshared,
> + * but not this particular configuration.
>   */
>  
> -static const struct x86_cpu_id snc_cpu[] = {
> -	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL),
> +static const struct x86_cpu_id intel_cod_cpu[] = {
> +	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0),	/* COD */
> +	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0),	/* COD */
> +	X86_MATCH_INTEL_FAM6_MODEL(ANY, 1),		/* SNC */
>  	{}
>  };
>  
>  static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
>  {
> +	const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
>  	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
> +	bool intel_snc = id && id->driver_data;
>  
>  	/* Do not match if we do not have a valid APICID for cpu: */
>  	if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
> @@ -495,32 +518,12 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
>  	 * means 'c' does not share the LLC of 'o'. This will be
>  	 * reflected to userspace.
>  	 */
> -	if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
> +	if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
>  		return false;
>  
>  	return topology_sane(c, o, "llc");
>  }
>  
> -/*
> - * Unlike the other levels, we do not enforce keeping a
> - * multicore group inside a NUMA node. If this happens, we will
> - * discard the MC level of the topology later.
> - */
> -static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
> -{
> -	if (c->phys_proc_id == o->phys_proc_id)
> -		return true;
> -	return false;
> -}
> -
> -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
> -{
> -	if ((c->phys_proc_id == o->phys_proc_id) &&
> -	    (c->cpu_die_id == o->cpu_die_id))
> -		return true;
> -	return false;
> -}
> -
>  
>  #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
>  static inline int x86_sched_itmt_flags(void)
> @@ -592,14 +595,23 @@ void set_cpu_sibling_map(int cpu)
>  	for_each_cpu(i, cpu_sibling_setup_mask) {
>  		o = &cpu_data(i);
>  
> +		if (match_pkg(c, o) && !topology_same_node(c, o))
> +			x86_has_numa_in_package = true;
> +
>  		if ((i == cpu) || (has_smt && match_smt(c, o)))
>  			link_mask(topology_sibling_cpumask, cpu, i);
>  
>  		if ((i == cpu) || (has_mp && match_llc(c, o)))
>  			link_mask(cpu_llc_shared_mask, cpu, i);
>  
> +		if ((i == cpu) || (has_mp && match_die(c, o)))
> +			link_mask(topology_die_cpumask, cpu, i);
>  	}
>  
> +	threads = cpumask_weight(topology_sibling_cpumask(cpu));
> +	if (threads > __max_smt_threads)
> +		__max_smt_threads = threads;
> +
>  	/*
>  	 * This needs a separate iteration over the cpus because we rely on all
>  	 * topology_sibling_cpumask links to be set-up.
> @@ -613,8 +625,7 @@ void set_cpu_sibling_map(int cpu)
>  		/*
>  		 * Does this new cpu bringup a new core?
>  		 */
> -		if (cpumask_weight(
> -		    topology_sibling_cpumask(cpu)) == 1) {
> +		if (threads == 1) {
>  			/*
>  			 * for each core in package, increment
>  			 * the booted_cores for this new cpu
> @@ -631,16 +642,7 @@ void set_cpu_sibling_map(int cpu)
>  			} else if (i != cpu && !c->booted_cores)
>  				c->booted_cores = cpu_data(i).booted_cores;
>  		}
> -		if (match_pkg(c, o) && !topology_same_node(c, o))
> -			x86_has_numa_in_package = true;
> -
> -		if ((i == cpu) || (has_mp && match_die(c, o)))
> -			link_mask(topology_die_cpumask, cpu, i);
>  	}
> -
> -	threads = cpumask_weight(topology_sibling_cpumask(cpu));
> -	if (threads > __max_smt_threads)
> -		__max_smt_threads = threads;
>  }
>  
>  /* maps the cpu to the sched domain representing multi-core */
> -- 
> 2.20.1
> 
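
For anyone who wants to confirm whether a box is actually running the SNC
layout this patch targets, here is a rough userspace sketch of my own (not
part of the patch). It assumes cache index3 in sysfs is the LLC, that
libnuma is installed, and it arbitrarily looks at cpu0:

/*
 * Illustrative userspace check, not part of the patch: walk the CPUs that
 * share cpu0's last-level cache and report whether they sit in more than
 * one NUMA node (the SNC layout described above). Assumes cache index3 is
 * the LLC and libnuma is available.
 */
#include <numa.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	char list[512];
	int first_node = -1, spans = 0;
	FILE *f;

	if (numa_available() < 0) {
		fprintf(stderr, "no NUMA support\n");
		return 1;
	}

	f = fopen("/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list", "r");
	if (!f || !fgets(list, sizeof(list), f)) {
		perror("shared_cpu_list");
		return 1;
	}
	fclose(f);

	/* The list looks like "0-27,56-83"; expand it and note each CPU's node. */
	for (char *tok = strtok(list, ",\n"); tok; tok = strtok(NULL, ",\n")) {
		int lo, hi, n = sscanf(tok, "%d-%d", &lo, &hi);

		if (n < 1)
			continue;
		if (n == 1)
			hi = lo;
		for (int cpu = lo; cpu <= hi; cpu++) {
			int node = numa_node_of_cpu(cpu);

			if (first_node < 0)
				first_node = node;
			else if (node != first_node)
				spans = 1;
		}
	}

	printf("cpu0's LLC siblings %s\n",
	       spans ? "span multiple NUMA nodes (SNC-style topology)"
		     : "stay within one NUMA node");
	return 0;
}

Build with "gcc snc_check.c -lnuma". On an SNC-enabled Skylake, Ice Lake,
or Sapphire Rapids server it should report that the LLC siblings span more
than one node, which is exactly the case the patch stops warning about.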