On Thu, Jun 01, 2023 at 04:47:06PM +0200, Peter Zijlstra wrote: > One way to fix all this would be by having arch/x86/kernel/smpboot.c set > an AMD specific set_sched_topology() that has a CCD domain above the MC > and below the DIE domain that groups 'near' CCDs together based on some > AMD specific topology information. > > Then for small systems that will probably be just a single CCD domain > and the degenerate code will make it go away, but for these large > systems it will do what is right for their respective configuration. > > Then, since this new multi-llc code uses MC->parent it will end up on > the fancy new CCD domain and not scan the *entire* socket. > > Hmm? Something like the (untested) below might be a nice base to go from. Then all you have to do is add something like: if (x86_has_ccd_topology) { x86_topology[i++] = (struct sched_domain_topology_level){ cpu_ccd_mask, SD_INIT_NAME(CCD) }; } (and construct cpu_ccd_mask obviously...) --- arch/x86/kernel/smpboot.c | 94 ++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 51 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 34066f6735dd..0a22d719b6b6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -563,50 +563,57 @@ static int x86_cluster_flags(void) #endif #endif -static struct sched_domain_topology_level x86_numa_in_package_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, -#endif - { NULL, }, -}; +/* + * Set if a package/die has multiple NUMA nodes inside. + * AMD Magny-Cours, Intel Cluster-on-Die, and Intel + * Sub-NUMA Clustering have this. + */ +static bool x86_has_numa_in_package; -static struct sched_domain_topology_level x86_hybrid_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(DIE) }, - { NULL, }, -}; +static struct sched_domain_topology_level x86_topology[6]; + +static void __init build_sched_topology(void) +{ + int i = 0; -static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) + }; #endif #ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, + /* + * For now, skip the cluster domain on Hybrid. + */ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) + }; + } #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) + }; #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; + /* + * When there is NUMA topology inside the package skip the DIE domain + * since the NUMA domains will auto-magically create the right spanning + * domains based on the SLIT. + */ + if (!x86_has_numa_in_package) { + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_cpu_mask, SD_INIT_NAME(DIE) + }; + } -/* - * Set if a package/die has multiple NUMA nodes inside. - * AMD Magny-Cours, Intel Cluster-on-Die, and Intel - * Sub-NUMA Clustering have this. - */ -static bool x86_has_numa_in_package; + /* + * There must be one trailing NULL entry left. + */ + BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); + + set_sched_topology(x86_topology); +} void set_cpu_sibling_map(int cpu) { @@ -1390,15 +1397,6 @@ void __init smp_prepare_cpus_common(void) zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); } - /* - * Set 'default' x86 topology, this matches default_topology() in that - * it has NUMA nodes as a topology level. See also - * native_smp_cpus_done(). - * - * Must be done before set_cpus_sibling_map() is ran. - */ - set_sched_topology(x86_topology); - set_cpu_sibling_map(0); } @@ -1490,13 +1488,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) pr_debug("Boot done\n"); calculate_max_logical_packages(); - - /* XXX for now assume numa-in-package and hybrid don't overlap */ - if (x86_has_numa_in_package) - set_sched_topology(x86_numa_in_package_topology); - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - set_sched_topology(x86_hybrid_topology); - + build_sched_topology(); nmi_selftest(); impress_friends(); cache_aps_init();