Add a new cpuset.isolation_mask file in order to be able to modify the
housekeeping cpumask for each individual isolation feature at runtime.
In the future this will include nohz_full, unbound timers, unbound
workqueues, unbound kthreads, managed irqs, etc...

Start with supporting domain exclusion and CPUs passed through
"isolcpus=".

The cpuset.isolation_mask defaults to 0. Setting it to 1 will exclude
the CPUs of the given cpuset from the scheduler domains (they will be
attached to the NULL domain). As long as a CPU is part of any cpuset
with cpuset.isolation_mask set to 1, it will remain isolated even if it
overlaps with another cpuset that has cpuset.isolation_mask set to 0.
The same applies to parent and subdirectories.

If the CPUs of a cpuset are a subset of "isolcpus=", the cpuset
automatically takes over that subset and its cpuset.isolation_mask is
set to 1. This subset is then cleared from the initial "isolcpus="
mask. The user is then free to override cpuset.isolation_mask to 0 in
order to revert the effect of "isolcpus=".

Here is an example of use where CPU 7 has been isolated at boot and
gets re-attached to domains later from cpuset:

	$ cat /proc/cmdline
		isolcpus=7
	$ cd /sys/fs/cgroup/cpuset
	$ mkdir cpu7
	$ cd cpu7
	$ cat cpuset.cpus
		0-7
	$ cat cpuset.isolation_mask
		0
	$ ls /sys/kernel/debug/domains/cpu7	# empty because isolcpus=7
	$ echo 7 > cpuset.cpus
	$ cat cpuset.isolation_mask	# isolcpus subset automatically mapped
		1
	$ echo 0 > cpuset.isolation_mask
	$ ls /sys/kernel/debug/domains/cpu7/
		domain0 domain1

CHECKME: Should we have individual cpuset.isolation.$feature files for
         each isolation feature instead of a single mask file?

CHECKME: The scheduler is unhappy when _every_ CPU is isolated Signed-off-by: Frederic Weisbecker <frederic@xxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Juri Lelli <juri.lelli@xxxxxxxxxx> Cc: Marcelo Tosatti <mtosatti@xxxxxxxxxx> Cc: Nitesh Lal <nilal@xxxxxxxxxx> Cc: Nicolas Saenz <nsaenzju@xxxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: Christoph Lameter <cl@xxxxxxxxx> Cc: Tejun Heo <tj@xxxxxxxxxx> Cc: Zefan Li <lizefan.x@xxxxxxxxxxxxx> Cc: Alex Belits <abelits@xxxxxxxxxxx> --- kernel/cgroup/cpuset.c | 111 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index adb5190c4429..ecb63be04408 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -82,6 +82,7 @@ struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ + unsigned long isol_flags; /* * On default hierarchy: @@ -258,6 +259,17 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +/* bits in struct cpuset isol_flags field */ +typedef enum { + CS_ISOL_DOMAIN, + CS_ISOL_MAX +} isol_flagbits_t; + +static inline int is_isol_domain(const struct cpuset *cs) +{ + return test_bit(CS_ISOL_DOMAIN, &cs->isol_flags); +} + static inline int is_partition_root(const struct cpuset *cs) { return cs->partition_root_state > 0; @@ -269,6 +281,13 @@ static struct cpuset top_cpuset = { .partition_root_state = PRS_ENABLED, }; +/* + * CPUs passed through "isolcpus=" on boot, waiting to be mounted + * as soon as we meet a cpuset directory whose cpus_allowed is a + * subset of "isolcpus=" + */ +static cpumask_var_t unmounted_isolcpus_mask; + /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child @@ -681,6 +700,39 @@ static inline int nr_cpusets(void) return static_key_count(&cpusets_enabled_key.key) + 1; } +static int 
update_domain_housekeeping_mask(void) +{ + struct cpuset *cp; /* top-down scan of cpusets */ + struct cgroup_subsys_state *pos_css; + cpumask_var_t domain_mask; + + if (!zalloc_cpumask_var(&domain_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_andnot(domain_mask, cpu_possible_mask, unmounted_isolcpus_mask); + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (is_isol_domain(cp)) + cpumask_andnot(domain_mask, domain_mask, cp->cpus_allowed); + + if (cpumask_subset(cp->cpus_allowed, unmounted_isolcpus_mask)) { + unsigned long flags; + cpumask_andnot(unmounted_isolcpus_mask, unmounted_isolcpus_mask, + cp->cpus_allowed); + spin_lock_irqsave(&callback_lock, flags); + cp->isol_flags |= BIT(CS_ISOL_DOMAIN); + spin_unlock_irqrestore(&callback_lock, flags); + } + } + rcu_read_unlock(); + + housekeeping_cpumask_set(domain_mask, HK_FLAG_DOMAIN); + free_cpumask_var(domain_mask); + + return 0; +} + /* * generate_sched_domains() * @@ -741,6 +793,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ + int err; cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ @@ -752,6 +805,10 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; + err = update_domain_housekeeping_mask(); + if (err < 0) + pr_err("Can't update housekeeping cpumask\n"); + /* Special case for the 99% of systems with one, full, sched domain */ if (root_load_balance && !top_cpuset.nr_subparts_cpus) { ndoms = 1; @@ -1449,7 +1506,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) * root as well. 
*/ if (!cpumask_empty(cp->cpus_allowed) && - is_sched_load_balance(cp) && + (is_sched_load_balance(cp) || is_isol_domain(cs)) && (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || is_partition_root(cp))) need_rebuild_sched_domains = true; @@ -1935,6 +1992,30 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, return err; } +/* + * update_isol_flags - replace the isolation feature mask of a cpuset + * mask: the new mask value to apply (see isol_flagbits_t) + * cs: the cpuset to update + * + * Call with cpuset_mutex held. + */ +static int update_isol_flags(struct cpuset *cs, u64 mask) +{ + unsigned long old_mask = cs->isol_flags; + + if (mask & ~(BIT_ULL(CS_ISOL_MAX) - 1)) + return -EINVAL; + + spin_lock_irq(&callback_lock); + cs->isol_flags = (unsigned long)mask; + spin_unlock_irq(&callback_lock); + + if (mask ^ old_mask) + rebuild_sched_domains_locked(); + + return 0; +} + /* * update_prstate - update partititon_root_state * cs: the cpuset to update @@ -2273,6 +2354,9 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, +//CHECKME: should we have individual cpuset.isolation.$feature files +//instead of a mask of features in a single file? 
+ FILE_ISOLATION_MASK, } cpuset_filetype_t; static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, @@ -2314,6 +2398,9 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, case FILE_SPREAD_SLAB: retval = update_flag(CS_SPREAD_SLAB, cs, val); break; + case FILE_ISOLATION_MASK: + retval = update_isol_flags(cs, val); + break; default: retval = -EINVAL; break; @@ -2481,6 +2568,8 @@ static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) return is_spread_page(cs); case FILE_SPREAD_SLAB: return is_spread_slab(cs); + case FILE_ISOLATION_MASK: + return cs->isol_flags; default: BUG(); } @@ -2658,6 +2747,13 @@ static struct cftype legacy_files[] = { .private = FILE_MEMORY_PRESSURE_ENABLED, }, + { + .name = "isolation_mask", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_ISOLATION_MASK, + }, + { } /* terminate */ }; @@ -2834,9 +2930,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (is_partition_root(cs)) update_prstate(cs, 0); - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_sched_load_balance(cs)) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + if (is_isol_domain(cs)) + update_isol_flags(cs, cs->isol_flags & ~BIT(CS_ISOL_DOMAIN)); + } if (cs->use_parent_ecpus) { struct cpuset *parent = parent_cs(cs); @@ -2873,6 +2972,9 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) top_cpuset.mems_allowed = top_cpuset.effective_mems; } + cpumask_andnot(unmounted_isolcpus_mask, cpu_possible_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); + spin_unlock_irq(&callback_lock); percpu_up_write(&cpuset_rwsem); } @@ -2932,6 +3034,7 @@ int __init cpuset_init(void) top_cpuset.relax_domain_level = -1; BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&unmounted_isolcpus_mask, 
GFP_KERNEL)); return 0; } -- 2.25.1