[RFC PATCH 6/6] cpuset: Add cpuset.isolation_mask file

Add a new cpuset.isolation_mask file in order to allow modifying the
housekeeping cpumask of each individual isolation feature at runtime.
In the future this will include nohz_full, unbound timers,
unbound workqueues, unbound kthreads, managed irqs, etc...

Start by supporting domain exclusion and the CPUs passed through
"isolcpus=".

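For instance, with bit 0 being the domain-exclusion bit introduced here
(future features would claim further bits of the mask):

	$ echo 1 > cpuset.isolation_mask	# set bit 0: detach this cpuset's CPUs from the sched domains
	$ echo 0 > cpuset.isolation_mask	# clear bit 0: re-attach them
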
The cpuset.isolation_mask file defaults to 0. Setting it to 1 will
exclude the given cpuset from the sched domains (its CPUs will be
attached to the NULL domain). As long as a CPU is part of any cpuset
with cpuset.isolation_mask set to 1, it will remain isolated even if it
overlaps with another cpuset that has cpuset.isolation_mask set to 0.
The same applies to parent and child cpusets.
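
For instance, in this hypothetical session (cpuset names and CPU numbers
are made up for illustration), CPUs 3-4 stay isolated even though they
overlap the "mixed" cpuset whose mask is 0:

	$ cd /sys/fs/cgroup/cpuset
	$ mkdir isol mixed
	$ echo 3-4 > isol/cpuset.cpus
	$ echo 1 > isol/cpuset.isolation_mask
	$ echo 2-5 > mixed/cpuset.cpus
	$ cat mixed/cpuset.isolation_mask
		0
	$ ls /sys/kernel/debug/domains/cpu3	# still empty: CPU 3 is in "isol"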

If a cpuset's CPUs are a subset of "isolcpus=", the subset is
automatically mapped and cpuset.isolation_mask will be set to 1. This
subset is then cleared from the initial "isolcpus=" mask. The user is
then free to override cpuset.isolation_mask back to 0 in order to revert
the effect of "isolcpus=".

Here is an example of use where CPU 7 has been isolated on boot and
gets re-attached to the domains later from cpuset:

	$ cat /proc/cmdline
		isolcpus=7
	$ cd /sys/fs/cgroup/cpuset
	$ mkdir cpu7
	$ cd cpu7
	$ cat cpuset.cpus
		0-7
	$ cat cpuset.isolation_mask
		0
	$ ls /sys/kernel/debug/domains/cpu7	# empty because isolcpus=7
	$ echo 7 > cpuset.cpus
	$ cat cpuset.isolation_mask	# isolcpus subset automatically mapped
		1
	$ echo 0 > cpuset.isolation_mask
	$ ls /sys/kernel/debug/domains/cpu7/
		domain0  domain1

CHECKME: Should we have individual cpuset.isolation.$feature files for
         each isolation feature instead of a single mask file?
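
         If we went the per-feature route, the interface would
         presumably look like this instead (hypothetical file names):

	$ echo 1 > cpuset.isolation.domain
	$ echo 1 > cpuset.isolation.nohz_full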

CHECKME: The scheduler is unhappy when _every_ CPU is isolated

Signed-off-by: Frederic Weisbecker <frederic@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
Cc: Nitesh Lal <nilal@xxxxxxxxxx>
Cc: Nicolas Saenz <nsaenzju@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Zefan Li <lizefan.x@xxxxxxxxxxxxx>
Cc: Alex Belits <abelits@xxxxxxxxxxx>
---
 kernel/cgroup/cpuset.c | 111 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 107 insertions(+), 4 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index adb5190c4429..ecb63be04408 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -82,6 +82,7 @@ struct cpuset {
 	struct cgroup_subsys_state css;
 
 	unsigned long flags;		/* "unsigned long" so bitops work */
+	unsigned long isol_flags;
 
 	/*
 	 * On default hierarchy:
@@ -258,6 +259,17 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
+/* bits in struct cpuset isol_flags field */
+typedef enum {
+	CS_ISOL_DOMAIN,
+	CS_ISOL_MAX
+} isol_flagbits_t;
+
+static inline int is_isol_domain(const struct cpuset *cs)
+{
+	return test_bit(CS_ISOL_DOMAIN, &cs->isol_flags);
+}
+
 static inline int is_partition_root(const struct cpuset *cs)
 {
 	return cs->partition_root_state > 0;
@@ -269,6 +281,13 @@ static struct cpuset top_cpuset = {
 	.partition_root_state = PRS_ENABLED,
 };
 
+/*
+ * CPUs passed through "isolcpus=" on boot, waiting to be mapped
+ * as soon as we meet a cpuset directory whose cpus_allowed is a
+ * subset of "isolcpus="
+ */
+static cpumask_var_t unmounted_isolcpus_mask;
+
 /**
  * cpuset_for_each_child - traverse online children of a cpuset
  * @child_cs: loop cursor pointing to the current child
@@ -681,6 +700,39 @@ static inline int nr_cpusets(void)
 	return static_key_count(&cpusets_enabled_key.key) + 1;
 }
 
+static int update_domain_housekeeping_mask(void)
+{
+	struct cpuset *cp;	/* top-down scan of cpusets */
+	struct cgroup_subsys_state *pos_css;
+	cpumask_var_t domain_mask;
+
+	if (!zalloc_cpumask_var(&domain_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	cpumask_andnot(domain_mask, cpu_possible_mask, unmounted_isolcpus_mask);
+
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+		if (is_isol_domain(cp))
+			cpumask_andnot(domain_mask, domain_mask, cp->cpus_allowed);
+
+		if (cpumask_subset(cp->cpus_allowed, unmounted_isolcpus_mask)) {
+			unsigned long flags;
+			cpumask_andnot(unmounted_isolcpus_mask, unmounted_isolcpus_mask,
+				       cp->cpus_allowed);
+			spin_lock_irqsave(&callback_lock, flags);
+			cp->isol_flags |= BIT(CS_ISOL_DOMAIN);
+			spin_unlock_irqrestore(&callback_lock, flags);
+		}
+	}
+	rcu_read_unlock();
+
+	housekeeping_cpumask_set(domain_mask, HK_FLAG_DOMAIN);
+	free_cpumask_var(domain_mask);
+
+	return 0;
+}
+
 /*
  * generate_sched_domains()
  *
@@ -741,6 +793,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
+	int err;
 	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
@@ -752,6 +805,10 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	dattr = NULL;
 	csa = NULL;
 
+	err = update_domain_housekeeping_mask();
+	if (err < 0)
+		pr_err("Can't update housekeeping cpumask\n");
+
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
 		ndoms = 1;
@@ -1449,7 +1506,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
 		 * root as well.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
-		    is_sched_load_balance(cp) &&
+		    (is_sched_load_balance(cp) || is_isol_domain(cp)) &&
 		   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
 		    is_partition_root(cp)))
 			need_rebuild_sched_domains = true;
@@ -1935,6 +1992,30 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	return err;
 }
 
+/*
+ * update_isol_flags - update the isolation flags of a cpuset from a mask value
+ * cs:		the cpuset to update
+ * mask:	the new mask value to apply (see isol_flagbits_t)
+ *
+ * Call with cpuset_mutex held.
+ */
+static int update_isol_flags(struct cpuset *cs, u64 mask)
+{
+	unsigned long old_mask = cs->isol_flags;
+
+	if (mask & ~(BIT_ULL(CS_ISOL_MAX) - 1))
+		return -EINVAL;
+
+	spin_lock_irq(&callback_lock);
+	cs->isol_flags = (unsigned long)mask;
+	spin_unlock_irq(&callback_lock);
+
+	if (mask ^ old_mask)
+		rebuild_sched_domains_locked();
+
+	return 0;
+}
+
 /*
  * update_prstate - update partititon_root_state
  * cs:	the cpuset to update
@@ -2273,6 +2354,9 @@ typedef enum {
 	FILE_MEMORY_PRESSURE,
 	FILE_SPREAD_PAGE,
 	FILE_SPREAD_SLAB,
+//CHECKME: should we have individual cpuset.isolation.$feature files
+//instead of a mask of features in a single file?
+	FILE_ISOLATION_MASK,
 } cpuset_filetype_t;
 
 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -2314,6 +2398,9 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	case FILE_SPREAD_SLAB:
 		retval = update_flag(CS_SPREAD_SLAB, cs, val);
 		break;
+	case FILE_ISOLATION_MASK:
+		retval = update_isol_flags(cs, val);
+		break;
 	default:
 		retval = -EINVAL;
 		break;
@@ -2481,6 +2568,8 @@ static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
 		return is_spread_page(cs);
 	case FILE_SPREAD_SLAB:
 		return is_spread_slab(cs);
+	case FILE_ISOLATION_MASK:
+		return cs->isol_flags;
 	default:
 		BUG();
 	}
@@ -2658,6 +2747,13 @@ static struct cftype legacy_files[] = {
 		.private = FILE_MEMORY_PRESSURE_ENABLED,
 	},
 
+	{
+		.name = "isolation_mask",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_ISOLATION_MASK,
+	},
+
 	{ }	/* terminate */
 };
 
@@ -2834,9 +2930,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	if (is_partition_root(cs))
 		update_prstate(cs, 0);
 
-	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-	    is_sched_load_balance(cs))
-		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+		if (is_sched_load_balance(cs))
+			update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+		if (is_isol_domain(cs))
+			update_isol_flags(cs, cs->isol_flags & ~BIT(CS_ISOL_DOMAIN));
+	}
 
 	if (cs->use_parent_ecpus) {
 		struct cpuset *parent = parent_cs(cs);
@@ -2873,6 +2972,9 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 		top_cpuset.mems_allowed = top_cpuset.effective_mems;
 	}
 
+	cpumask_andnot(unmounted_isolcpus_mask, cpu_possible_mask,
+		       housekeeping_cpumask(HK_FLAG_DOMAIN));
+
 	spin_unlock_irq(&callback_lock);
 	percpu_up_write(&cpuset_rwsem);
 }
@@ -2932,6 +3034,7 @@ int __init cpuset_init(void)
 	top_cpuset.relax_domain_level = -1;
 
 	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
+	BUG_ON(!alloc_cpumask_var(&unmounted_isolcpus_mask, GFP_KERNEL));
 
 	return 0;
 }
-- 
2.25.1



