Re: [RFC PATCH] cgroup: support numabalancing disable in cgroup level

It seems the mailing list rejects mails from hotmail... adding the cgroups mailing list.

Honglei

At 2021-09-22 20:03:47, "Honglei Wang" <jameshongleiwang@xxxxxxxxxxx> wrote:
>We run more and more containers on the same host in cloud-native
>environments today. While dealing with performance problems, I found
>that some containers benefit from NUMA balancing while others are
>hurt by it. It would be better if we could enable/disable this
>feature at the cgroup level.
>
>The basic idea is that NUMA balancing for a task only happens when
>sched_numa_balancing is set ON and it is not disabled in the cgroup
>the task is attached to.
>
>'cgroup.numabalancing_disable' is introduced for non-root cgroups as
>an interface to disable NUMA balancing for the cgroup and its
>descendants. A 'numabalancing_disable' item is added to cgroup.events
>to show the effective numabalancing_disable state.
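>
>A minimal usage sketch (the 'mygrp' path is hypothetical; it assumes
>cgroup2 is mounted at /sys/fs/cgroup and that a task is attached to
>the group):
>
>  # echo 1 > /sys/fs/cgroup/mygrp/cgroup.numabalancing_disable
>  # cat /sys/fs/cgroup/mygrp/cgroup.events
>  populated 1
>  frozen 0
>  numabalancing_disable 1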
>
>We only provide an option to disable NUMA balancing for a specific
>cgroup. The main purpose is to prevent balancing for the tasks
>attached to the cgroup when sched_numa_balancing is set ON; balancing
>doesn't happen anyway when sched_numa_balancing is set OFF.
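>
>For reference, the global switch is the existing kernel.numa_balancing
>sysctl:
>
>  # cat /proc/sys/kernel/numa_balancing
>  1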
>
>The inheritance relationship is similar to the freezer controller. I
>borrowed the freezer's logic for the implementation and added
>'numa_cgrp_disable' to task_struct to track whether the cgroup the
>task is attached to has numabalancing disabled.
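>
>A hypothetical illustration of the inheritance (paths made up): after
>disabling in a parent, a child reports the effective state in
>cgroup.events while its own knob stays 0:
>
>  # echo 1 > /sys/fs/cgroup/parent/cgroup.numabalancing_disable
>  # cat /sys/fs/cgroup/parent/child/cgroup.numabalancing_disable
>  0
>  # grep numabalancing_disable /sys/fs/cgroup/parent/child/cgroup.events
>  numabalancing_disable 1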
>
>Signed-off-by: Honglei Wang <jameshongleiwang@xxxxxxxxxxx>
>---
> include/linux/cgroup-defs.h |  23 +++++++
> include/linux/sched.h       |   3 +
> kernel/cgroup/cgroup.c      | 145 ++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/fair.c         |  12 +++-
> 4 files changed, 180 insertions(+), 3 deletions(-)
>
>diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
>index e1c705fdfa7c..e4e180e0adc6 100644
>--- a/include/linux/cgroup-defs.h
>+++ b/include/linux/cgroup-defs.h
>@@ -74,6 +74,11 @@ enum {
> 
> 	/* Control group has to be killed. */
> 	CGRP_KILL,
>+
>+#ifdef CONFIG_NUMA_BALANCING
>+	/* Control group has numabalancing disabled. */
>+	CGRP_NUMABALANCING_DISABLE,
>+#endif
> };
> 
> /* cgroup_root->flags */
>@@ -357,6 +362,21 @@ struct cgroup_freezer_state {
> 	int nr_frozen_tasks;
> };
> 
>+#ifdef CONFIG_NUMA_BALANCING
>+struct cgroup_numabalancing_state {
>+	/* Should the cgroup and its descendants have numabalancing disabled. */
>+	bool nb_disable;
>+
>+	/* Number of settings (self and ancestors) disabling numabalancing. */
>+	int e_nb_disable;
>+};
>+
>+#else /* CONFIG_NUMA_BALANCING */
>+
>+struct cgroup_numabalancing_state { };
>+
>+#endif /* CONFIG_NUMA_BALANCING */
>+
> struct cgroup {
> 	/* self css with NULL ->ss, points back to this cgroup */
> 	struct cgroup_subsys_state self;
>@@ -486,6 +506,9 @@ struct cgroup {
> 	/* Used to store internal freezer state */
> 	struct cgroup_freezer_state freezer;
> 
>+	/* Used to store numabalancing state */
>+	struct cgroup_numabalancing_state nb_state;
>+
> 	/* ids of the ancestors at each level including self */
> 	u64 ancestor_ids[];
> };
>diff --git a/include/linux/sched.h b/include/linux/sched.h
>index e12b524426b0..3375b9d8c5ce 100644
>--- a/include/linux/sched.h
>+++ b/include/linux/sched.h
>@@ -1246,6 +1246,9 @@ struct task_struct {
> 	u64				last_sum_exec_runtime;
> 	struct callback_head		numa_work;
> 
>+	/* Whether numabalancing is disabled at the cgroup level */
>+	int				numa_cgrp_disable;
>+
> 	/*
> 	 * This pointer is only modified for current in syscall and
> 	 * pagefault context (and for tasks being destroyed), so it can be read
>diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
>index 881ce1470beb..cba5edb7a9a7 100644
>--- a/kernel/cgroup/cgroup.c
>+++ b/kernel/cgroup/cgroup.c
>@@ -2470,6 +2470,13 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
> 			 */
> 			cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
> 						    to_cset->dfl_cgrp);
>+
>+#ifdef CONFIG_NUMA_BALANCING
>+			if (to_cset->dfl_cgrp->nb_state.nb_disable)
>+				WRITE_ONCE(task->numa_cgrp_disable, 1);
>+			else
>+				WRITE_ONCE(task->numa_cgrp_disable, 0);
>+#endif
> 			put_css_set_locked(from_cset);
> 
> 		}
>@@ -3533,6 +3540,11 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
> 	seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
> 	seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
> 
>+#ifdef CONFIG_NUMA_BALANCING
>+	seq_printf(seq, "numabalancing_disable %d\n",
>+		   test_bit(CGRP_NUMABALANCING_DISABLE, &cgrp->flags));
>+#endif
>+
> 	return 0;
> }
> 
>@@ -3675,6 +3687,118 @@ bool cgroup_psi_enabled(void)
> 
> #endif /* CONFIG_PSI */
> 
>+#ifdef CONFIG_NUMA_BALANCING
>+static void __cgroup_numabalancing_disable_set(struct cgroup *cgrp, bool nb_disable)
>+{
>+	struct css_task_iter it;
>+	struct task_struct *task;
>+
>+	lockdep_assert_held(&cgroup_mutex);
>+
>+	spin_lock_irq(&css_set_lock);
>+	if (nb_disable)
>+		set_bit(CGRP_NUMABALANCING_DISABLE, &cgrp->flags);
>+	else
>+		clear_bit(CGRP_NUMABALANCING_DISABLE, &cgrp->flags);
>+	spin_unlock_irq(&css_set_lock);
>+
>+	css_task_iter_start(&cgrp->self, 0, &it);
>+	while ((task = css_task_iter_next(&it))) {
>+		/*
>+		 * We don't care about NUMA placement if the task is exiting.
>+		 * And we don't NUMA balance for kthreads.
>+		 */
>+		if (task->flags & (PF_EXITING | PF_KTHREAD))
>+			continue;
>+		WRITE_ONCE(task->numa_cgrp_disable, nb_disable);
>+	}
>+	css_task_iter_end(&it);
>+}
>+
>+static void cgroup_numabalancing_disable_set(struct cgroup *cgrp, bool nb_disable)
>+{
>+	struct cgroup_subsys_state *css;
>+	struct cgroup *dsct;
>+
>+	lockdep_assert_held(&cgroup_mutex);
>+
>+	/*
>+	 * Nothing changed? Just exit.
>+	 */
>+	if (cgrp->nb_state.nb_disable == nb_disable)
>+		return;
>+
>+	cgrp->nb_state.nb_disable = nb_disable;
>+
>+	/*
>+	 * Propagate the change down the cgroup tree.
>+	 */
>+	css_for_each_descendant_pre(css, &cgrp->self) {
>+		dsct = css->cgroup;
>+
>+		if (cgroup_is_dead(dsct))
>+			continue;
>+
>+		if (nb_disable) {
>+			dsct->nb_state.e_nb_disable++;
>+			/*
>+			 * Already disabled because of an ancestor's setting?
>+			 */
>+			if (dsct->nb_state.e_nb_disable > 1)
>+				continue;
>+		} else {
>+			dsct->nb_state.e_nb_disable--;
>+			/*
>+			 * Still disabled because of an ancestor's setting?
>+			 */
>+			if (dsct->nb_state.e_nb_disable > 0)
>+				continue;
>+
>+			WARN_ON_ONCE(dsct->nb_state.e_nb_disable < 0);
>+		}
>+
>+		/*
>+		 * Change the actual state: disable or enable numabalancing.
>+		 */
>+		__cgroup_numabalancing_disable_set(dsct, nb_disable);
>+	}
>+}
>+
>+static int cgroup_numabalancing_disable_show(struct seq_file *seq, void *v)
>+{
>+	struct cgroup *cgrp = seq_css(seq)->cgroup;
>+
>+	seq_printf(seq, "%d\n", cgrp->nb_state.nb_disable);
>+
>+	return 0;
>+}
>+
>+static ssize_t cgroup_numabalancing_disable_write(struct kernfs_open_file *of,
>+				   char *buf, size_t nbytes, loff_t off)
>+{
>+	struct cgroup *cgrp;
>+	ssize_t ret;
>+	int nb_disable;
>+
>+	ret = kstrtoint(strstrip(buf), 0, &nb_disable);
>+	if (ret)
>+		return ret;
>+
>+	if (nb_disable < 0 || nb_disable > 1)
>+		return -ERANGE;
>+
>+	cgrp = cgroup_kn_lock_live(of->kn, false);
>+	if (!cgrp)
>+		return -ENOENT;
>+
>+	cgroup_numabalancing_disable_set(cgrp, nb_disable);
>+
>+	cgroup_kn_unlock(of->kn);
>+
>+	return nbytes;
>+}
>+#endif /* CONFIG_NUMA_BALANCING */
>+
> static int cgroup_freeze_show(struct seq_file *seq, void *v)
> {
> 	struct cgroup *cgrp = seq_css(seq)->cgroup;
>@@ -5015,6 +5139,14 @@ static struct cftype cgroup_base_files[] = {
> 		.release = cgroup_pressure_release,
> 	},
> #endif /* CONFIG_PSI */
>+#ifdef CONFIG_NUMA_BALANCING
>+	{
>+		.name = "cgroup.numabalancing_disable",
>+		.flags = CFTYPE_NOT_ON_ROOT,
>+		.seq_show = cgroup_numabalancing_disable_show,
>+		.write = cgroup_numabalancing_disable_write,
>+	},
>+#endif /* CONFIG_NUMA_BALANCING */
> 	{ }	/* terminate */
> };
> 
>@@ -5341,6 +5473,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
> 		set_bit(CGRP_FROZEN, &cgrp->flags);
> 	}
> 
>+#ifdef CONFIG_NUMA_BALANCING
>+	/*
>+	 * A new cgroup inherits its parent's effective numabalancing disable counter.
>+	 */
>+	cgrp->nb_state.e_nb_disable = parent->nb_state.e_nb_disable;
>+	if (cgrp->nb_state.e_nb_disable)
>+		set_bit(CGRP_NUMABALANCING_DISABLE, &cgrp->flags);
>+#endif
> 	spin_lock_irq(&css_set_lock);
> 	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
> 		cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
>@@ -6285,6 +6425,11 @@ void cgroup_post_fork(struct task_struct *child,
> 		 * userspace.
> 		 */
> 		kill = test_bit(CGRP_KILL, &cgrp_flags);
>+
>+#ifdef CONFIG_NUMA_BALANCING
>+		if (test_bit(CGRP_NUMABALANCING_DISABLE, &cgrp_flags))
>+			child->numa_cgrp_disable = 1;
>+#endif
> 	}
> 
> 	spin_unlock_irq(&css_set_lock);
>diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>index ff69f245b939..70d862213f63 100644
>--- a/kernel/sched/fair.c
>+++ b/kernel/sched/fair.c
>@@ -2629,7 +2629,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> 	struct numa_group *ng;
> 	int priv;
> 
>-	if (!static_branch_likely(&sched_numa_balancing))
>+	if (!static_branch_likely(&sched_numa_balancing) ||
>+	    READ_ONCE(p->numa_cgrp_disable))
> 		return;
> 
> 	/* for example, ksmd faulting in a user's mm */
>@@ -2910,6 +2911,9 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
> 	if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
> 		return;
> 
>+	if (READ_ONCE(curr->numa_cgrp_disable))
>+		return;
>+
> 	/*
> 	 * Using runtime rather than walltime has the dual advantage that
> 	 * we (mostly) drive the selection from busy threads and that the
>@@ -2934,7 +2938,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
> 	int src_nid = cpu_to_node(task_cpu(p));
> 	int dst_nid = cpu_to_node(new_cpu);
> 
>-	if (!static_branch_likely(&sched_numa_balancing))
>+	if (!static_branch_likely(&sched_numa_balancing) ||
>+	    READ_ONCE(p->numa_cgrp_disable))
> 		return;
> 
> 	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
>@@ -7727,7 +7732,8 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> 	unsigned long src_weight, dst_weight;
> 	int src_nid, dst_nid, dist;
> 
>-	if (!static_branch_likely(&sched_numa_balancing))
>+	if (!static_branch_likely(&sched_numa_balancing) ||
>+	    READ_ONCE(p->numa_cgrp_disable))
> 		return -1;
> 
> 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
>-- 
>2.14.1



