Cpusets confine processes to subsets of processors and memory nodes. When a process in a cpuset triggers an OOM, the OOM killer may kill a completely unrelated process on another NUMA node, which does not release any memory for this cpuset. It seems that `CONSTRAINT_CPUSET` is not really doing much these days. Using `CONSTRAINT_CPUSET`, we can easily achieve node-aware OOM killing by selecting the victim from the cpuset that triggered the OOM. Suggested-by: Michal Hocko <mhocko@xxxxxxxx> Signed-off-by: Gang Li <ligang.bdlg@xxxxxxxxxxxxx> --- This idea comes from a previous patch: mm, oom: Introduce per numa node oom for CONSTRAINT_MEMORY_POLICY https://lore.kernel.org/all/YoJ%2FioXwGTdCywUE@xxxxxxxxxxxxxx/ Any comments are welcome. --- include/linux/cpuset.h | 6 ++++++ kernel/cgroup/cpuset.c | 17 +++++++++++++++++ mm/oom_kill.c | 4 ++++ 3 files changed, 27 insertions(+) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index d58e0476ee8e..7475f613ab90 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -178,6 +178,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } +int cpuset_cgroup_scan_tasks(int (*fn)(struct task_struct *, void *), void *arg); + #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -299,6 +301,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } +static inline int cpuset_cgroup_scan_tasks(int (*fn)(struct task_struct *, void *), void *arg) +{ + return 0; +} #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b474289c15b8..1f1238b4276d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3943,6 +3943,23 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } +int cpuset_cgroup_scan_tasks(int (*fn)(struct task_struct *, void *), void *arg) +{ + int ret = 0; + struct cgroup *cgrp; + struct css_task_iter it; + struct task_struct *task; + + rcu_read_lock(); 
+ css_task_iter_start(&(task_cs(current)->css), CSS_TASK_ITER_PROCS, &it); + while (!ret && (task = css_task_iter_next(&it))) + ret = fn(task, arg); + css_task_iter_end(&it); + rcu_read_unlock(); + + return ret; +} + /* * Collection of memory_pressure is suppressed unless * this flag is enabled by writing "1" to the special diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 46e7e073f137..8cea787b359c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -367,6 +367,8 @@ static void select_bad_process(struct oom_control *oc) if (is_memcg_oom(oc)) mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); + else if (oc->constraint == CONSTRAINT_CPUSET) + cpuset_cgroup_scan_tasks(oom_evaluate_task, oc); else { struct task_struct *p; @@ -427,6 +429,8 @@ static void dump_tasks(struct oom_control *oc) if (is_memcg_oom(oc)) mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); + else if (oc->constraint == CONSTRAINT_CPUSET) + cpuset_cgroup_scan_tasks(dump_task, oc); else { struct task_struct *p; -- 2.20.1