It seems the mailing list rejects mails from hotmail... Adding the cgroups mailing list. Honglei At 2021-09-22 20:03:47, "Honglei Wang" <jameshongleiwang@xxxxxxxxxxx> wrote: >We have more and more containers running on the same host in cloud-native >environments today. While dealing with performance problems, I found >some containers benefit from the numabalancing, but others get >a negative influence. I'm thinking it would be better if we can support >enabling/disabling this feature at the cgroup level. > >Basically speaking, the idea is that the numabalancing for one task will >only happen when sched_numa_balancing is set ON and it's not disabled in >the cgroup which the task is attached to. > >'cgroup.numabalancing_disable' is introduced for non-root cgroups to >provide an interface to disable the numabalancing for the cgroup and its >descendants. And I add an item 'numabalancing_disable' in cgroup.events >to show the actual numabalancing_disable state. > >We just provide an option to disable the numabalancing for a specific cgroup. >The main purpose is to prevent the balancing from happening for the tasks >attached to the cgroup when the sched_numa_balancing is set ON. The >balancing won't happen anyway if the sched_numa_balancing is set OFF. > >The inheritance relationship is similar to the freezer controller. I borrowed >the logic of the freezer when doing the coding and added 'numa_cgrp_disable' in the >task_struct to track if the cgroup which the task is attached to is >numabalancing disabled. 
> >Signed-off-by: Honglei Wang <jameshongleiwang@xxxxxxxxxxx> >--- > include/linux/cgroup-defs.h | 23 +++++++ > include/linux/sched.h | 3 + > kernel/cgroup/cgroup.c | 145 ++++++++++++++++++++++++++++++++++++++++++++ > kernel/sched/fair.c | 12 +++- > 4 files changed, 180 insertions(+), 3 deletions(-) > >diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h >index e1c705fdfa7c..e4e180e0adc6 100644 >--- a/include/linux/cgroup-defs.h >+++ b/include/linux/cgroup-defs.h >@@ -74,6 +74,11 @@ enum { > > /* Control group has to be killed. */ > CGRP_KILL, >+ >+#ifdef CONFIG_NUMA_BALANCING >+ /* Control group set numabalancing disable. */ >+ CGRP_NUMABALANCING_DISABLE, >+#endif > }; > > /* cgroup_root->flags */ >@@ -357,6 +362,21 @@ struct cgroup_freezer_state { > int nr_frozen_tasks; > }; > >+#ifdef CONFIG_NUMA_BALANCING >+struct cgroup_numabalancing_state { >+ /* Should the cgroup and its descendants be changed state. */ >+ bool nb_disable; >+ >+ /* Should the cgroup actually be set as numabalancing disable? 
*/ >+ int e_nb_disable; >+}; >+ >+#else /* CONFIG_NUMA_BALANCING */ >+ >+struct cgroup_numabalancing_state { }; >+ >+#endif /* CONFIG_NUMA_BALANCING */ >+ > struct cgroup { > /* self css with NULL ->ss, points back to this cgroup */ > struct cgroup_subsys_state self; >@@ -486,6 +506,9 @@ struct cgroup { > /* Used to store internal freezer state */ > struct cgroup_freezer_state freezer; > >+ /* Used to store numabalancing state */ >+ struct cgroup_numabalancing_state nb_state; >+ > /* ids of the ancestors at each level including self */ > u64 ancestor_ids[]; > }; >diff --git a/include/linux/sched.h b/include/linux/sched.h >index e12b524426b0..3375b9d8c5ce 100644 >--- a/include/linux/sched.h >+++ b/include/linux/sched.h >@@ -1246,6 +1246,9 @@ struct task_struct { > u64 last_sum_exec_runtime; > struct callback_head numa_work; > >+ /* Mark if numabalancing is disabled in cgroup level */ >+ int numa_cgrp_disable; >+ > /* > * This pointer is only modified for current in syscall and > * pagefault context (and for tasks being destroyed), so it can be read >diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c >index 881ce1470beb..cba5edb7a9a7 100644 >--- a/kernel/cgroup/cgroup.c >+++ b/kernel/cgroup/cgroup.c >@@ -2470,6 +2470,13 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) > */ > cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, > to_cset->dfl_cgrp); >+ >+#ifdef CONFIG_NUMA_BALANCING >+ if (to_cset->dfl_cgrp->nb_state.nb_disable) >+ WRITE_ONCE(task->numa_cgrp_disable, 1); >+ else >+ WRITE_ONCE(task->numa_cgrp_disable, 0); >+#endif > put_css_set_locked(from_cset); > > } >@@ -3533,6 +3540,11 @@ static int cgroup_events_show(struct seq_file *seq, void *v) > seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); > seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); > >+#ifdef CONFIG_NUMA_BALANCING >+ seq_printf(seq, "numabalancing_disable %d\n", >+ test_bit(CGRP_NUMABALANCING_DISABLE, &cgrp->flags)); >+#endif >+ > return 
0; > } > >@@ -3675,6 +3687,118 @@ bool cgroup_psi_enabled(void) > > #endif /* CONFIG_PSI */ > >+#ifdef CONFIG_NUMA_BALANCING >+static void __cgroup_numabalancing_disable_set(struct cgroup *cgrp, bool nb_disable) >+{ >+ struct css_task_iter it; >+ struct task_struct *task; >+ >+ lockdep_assert_held(&cgroup_mutex); >+ >+ spin_lock_irq(&css_set_lock); >+ if (nb_disable) >+ set_bit(CGRP_NUMABALANCING_DISABLE, &cgrp->flags); >+ else >+ clear_bit(CGRP_NUMABALANCING_DISABLE, &cgrp->flags); >+ spin_unlock_irq(&css_set_lock); >+ >+ css_task_iter_start(&cgrp->self, 0, &it); >+ while ((task = css_task_iter_next(&it))) { >+ /* >+ * We don't care about NUMA placement if the task is exiting. >+ * And we don't NUMA balance for kthreads. >+ */ >+ if (task->flags & (PF_EXITING | PF_KTHREAD)) >+ continue; >+ task->numa_cgrp_disable = nb_disable; >+ } >+ css_task_iter_end(&it); >+} >+ >+static void cgroup_numabalancing_disable_set(struct cgroup *cgrp, bool nb_disable) >+{ >+ struct cgroup_subsys_state *css; >+ struct cgroup *dsct; >+ >+ lockdep_assert_held(&cgroup_mutex); >+ >+ /* >+ * Nothing changed? Just exit. >+ */ >+ if (cgrp->nb_state.nb_disable == nb_disable) >+ return; >+ >+ cgrp->nb_state.nb_disable = nb_disable; >+ >+ /* >+ * Propagate changes downwards the cgroup tree. >+ */ >+ css_for_each_descendant_pre(css, &cgrp->self) { >+ dsct = css->cgroup; >+ >+ if (cgroup_is_dead(dsct)) >+ continue; >+ >+ if (nb_disable) { >+ dsct->nb_state.e_nb_disable++; >+ /* >+ * Already be set because of ancestor's settings? >+ */ >+ if (dsct->nb_state.e_nb_disable > 1) >+ continue; >+ } else { >+ dsct->nb_state.e_nb_disable--; >+ /* >+ * Still keep numabalancing disable because of ancestor's settings? >+ */ >+ if (dsct->nb_state.e_nb_disable > 0) >+ continue; >+ >+ WARN_ON_ONCE(dsct->nb_state.e_nb_disable < 0); >+ } >+ >+ /* >+ * Do change actual state: numabalancing disable or enable. 
>+ */ >+ __cgroup_numabalancing_disable_set(dsct, nb_disable); >+ } >+} >+ >+static int cgroup_numabalancing_disable_show(struct seq_file *seq, void *v) >+{ >+ struct cgroup *cgrp = seq_css(seq)->cgroup; >+ >+ seq_printf(seq, "%d\n", cgrp->nb_state.nb_disable); >+ >+ return 0; >+} >+ >+static ssize_t cgroup_numabalancing_disable_write(struct kernfs_open_file *of, >+ char *buf, size_t nbytes, loff_t off) >+{ >+ struct cgroup *cgrp; >+ ssize_t ret; >+ int nb_disable; >+ >+ ret = kstrtoint(strstrip(buf), 0, &nb_disable); >+ if (ret) >+ return ret; >+ >+ if (nb_disable < 0 || nb_disable > 1) >+ return -ERANGE; >+ >+ cgrp = cgroup_kn_lock_live(of->kn, false); >+ if (!cgrp) >+ return -ENOENT; >+ >+ cgroup_numabalancing_disable_set(cgrp, nb_disable); >+ >+ cgroup_kn_unlock(of->kn); >+ >+ return nbytes; >+} >+#endif /* CONFIG_NUMA_BALANCING */ >+ > static int cgroup_freeze_show(struct seq_file *seq, void *v) > { > struct cgroup *cgrp = seq_css(seq)->cgroup; >@@ -5015,6 +5139,14 @@ static struct cftype cgroup_base_files[] = { > .release = cgroup_pressure_release, > }, > #endif /* CONFIG_PSI */ >+#ifdef CONFIG_NUMA_BALANCING >+ { >+ .name = "cgroup.numabalancing_disable", >+ .flags = CFTYPE_NOT_ON_ROOT, >+ .seq_show = cgroup_numabalancing_disable_show, >+ .write = cgroup_numabalancing_disable_write, >+ }, >+#endif /* CONFIG_NUMA_BALANCING */ > { } /* terminate */ > }; > >@@ -5341,6 +5473,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, > set_bit(CGRP_FROZEN, &cgrp->flags); > } > >+#ifdef CONFIG_NUMA_BALANCING >+ /* >+ * New cgroup inherits effective numabalancing disable counter. 
>+ */ >+ cgrp->nb_state.e_nb_disable = parent->nb_state.e_nb_disable; >+ if (cgrp->nb_state.e_nb_disable) >+ set_bit(CGRP_NUMABALANCING_DISABLE, &cgrp->flags); >+#endif > spin_lock_irq(&css_set_lock); > for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { > cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); >@@ -6285,6 +6425,11 @@ void cgroup_post_fork(struct task_struct *child, > * userspace. > */ > kill = test_bit(CGRP_KILL, &cgrp_flags); >+ >+#ifdef CONFIG_NUMA_BALANCING >+ if (test_bit(CGRP_NUMABALANCING_DISABLE, &cgrp_flags)) >+ child->numa_cgrp_disable = 1; >+#endif > } > > spin_unlock_irq(&css_set_lock); >diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >index ff69f245b939..70d862213f63 100644 >--- a/kernel/sched/fair.c >+++ b/kernel/sched/fair.c >@@ -2629,7 +2629,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) > struct numa_group *ng; > int priv; > >- if (!static_branch_likely(&sched_numa_balancing)) >+ if (!static_branch_likely(&sched_numa_balancing) || >+ READ_ONCE(p->numa_cgrp_disable)) > return; > > /* for example, ksmd faulting in a user's mm */ >@@ -2910,6 +2911,9 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) > if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) > return; > >+ if (READ_ONCE(curr->numa_cgrp_disable)) >+ return; >+ > /* > * Using runtime rather than walltime has the dual advantage that > * we (mostly) drive the selection from busy threads and that the >@@ -2934,7 +2938,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) > int src_nid = cpu_to_node(task_cpu(p)); > int dst_nid = cpu_to_node(new_cpu); > >- if (!static_branch_likely(&sched_numa_balancing)) >+ if (!static_branch_likely(&sched_numa_balancing) || >+ READ_ONCE(p->numa_cgrp_disable)) > return; > > if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) >@@ -7727,7 +7732,8 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) > unsigned long 
src_weight, dst_weight; > int src_nid, dst_nid, dist; > >- if (!static_branch_likely(&sched_numa_balancing)) >+ if (!static_branch_likely(&sched_numa_balancing) || >+ READ_ONCE(p->numa_cgrp_disable)) > return -1; > > if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) >-- >2.14.1