- Why do we need a per-memcg oom_score_adj setting? It is easy to deploy and very convenient for containers. When we use containers, we always treat a memcg as a whole; with a per-memcg oom_score_adj setting we don't need to set it process by process. Setting it on every process in a memcg is tedious for the user. In this patch, a file named memory.oom.score_adj is introduced. Its valid range is -1000 to +1000, the same as the process-level oom_score_adj. When OOM is invoked, the effective oom_score_adj is computed as follows: effective oom_score_adj = original oom_score_adj + memory.oom.score_adj The effective value is also clamped to the range -1000 to +1000. It acts like a hook that recalculates the oom_score_adj. Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Johannes Weiner <hannes@xxxxxxxxxxx> Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx> Cc: Roman Gushchin <guro@xxxxxx> --- include/linux/memcontrol.h | 24 ++++++++++++++++++++++++ mm/memcontrol.c | 38 ++++++++++++++++++++++++++++++++++++++ mm/oom_kill.c | 20 ++++++++------------ 3 files changed, 70 insertions(+), 12 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2cd4359..d2dbde5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,7 @@ #include <linux/vmstat.h> #include <linux/writeback.h> #include <linux/page-flags.h> +#include <linux/oom.h> struct mem_cgroup; struct page; @@ -224,6 +225,7 @@ struct mem_cgroup { * Should the OOM killer kill all belonging tasks, had it kill one? 
*/ bool oom_group; + short oom_score_adj; /* protected by memcg_oom_lock */ bool oom_lock; @@ -538,6 +540,23 @@ static inline bool task_in_memcg_oom(struct task_struct *p) return p->memcg_in_oom; } +static inline int mem_cgroup_score_adj(struct task_struct *p, int task_adj) +{ + struct mem_cgroup *memcg; + int adj = task_adj; + + memcg = mem_cgroup_from_task(p); + if (memcg != root_mem_cgroup) { + adj += memcg->oom_score_adj; + if (adj < OOM_SCORE_ADJ_MIN) + adj = OOM_SCORE_ADJ_MIN; + else if (adj > OOM_SCORE_ADJ_MAX) + adj = OOM_SCORE_ADJ_MAX; + } + + return adj; +} + bool mem_cgroup_oom_synchronize(bool wait); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); @@ -987,6 +1006,11 @@ static inline bool task_in_memcg_oom(struct task_struct *p) return false; } +static inline int mem_cgroup_score_adj(struct task_struct *p, int task_adj) +{ + return task_adj; +} + static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f5c0c5..065285c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5856,6 +5856,38 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } +static int memory_oom_score_adj_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + seq_printf(m, "%d\n", memcg->oom_score_adj); + + return 0; +} + +static ssize_t memory_oom_score_adj_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int oom_score_adj; + int ret; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtoint(buf, 0, &oom_score_adj); + if (ret) + return ret; + + if (oom_score_adj > 1000 || oom_score_adj < -1000) + return -EINVAL; + + memcg->oom_score_adj = oom_score_adj; + + return nbytes; +} + static struct cftype memory_files[] = { { .name = "current", @@ -5909,6 +5941,12 @@ static ssize_t 
memory_oom_group_write(struct kernfs_open_file *of, .seq_show = memory_oom_group_show, .write = memory_oom_group_write, }, + { + .name = "oom.score_adj", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_score_adj_show, + .write = memory_oom_score_adj_write, + }, { } /* terminate */ }; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eda2e2a..f3b0276 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -212,13 +212,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages) * unkillable or have been already oom reaped or the are in * the middle of vfork */ - adj = (long)p->signal->oom_score_adj; - if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_SKIP, &p->mm->flags) || - in_vfork(p)) { - task_unlock(p); - return 0; - } + adj = mem_cgroup_score_adj(p, p->signal->oom_score_adj); /* * The baseline for the badness score is the proportion of RAM that each @@ -404,7 +398,8 @@ static int dump_task(struct task_struct *p, void *arg) task->tgid, task->mm->total_vm, get_mm_rss(task->mm), mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), - task->signal->oom_score_adj, task->comm); + mem_cgroup_score_adj(task, task->signal->oom_score_adj), + task->comm); task_unlock(task); return 0; @@ -453,7 +448,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) { pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, - current->signal->oom_score_adj); + mem_cgroup_score_adj(current, current->signal->oom_score_adj)); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); @@ -939,8 +934,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) */ static int oom_kill_memcg_member(struct task_struct *task, void *message) { - if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN && - !is_global_init(task)) { + if (mem_cgroup_score_adj(task, 
task->signal->oom_score_adj) != + OOM_SCORE_ADJ_MIN && !is_global_init(task)) { get_task_struct(task); __oom_kill_process(task, message); } @@ -1085,7 +1080,8 @@ bool out_of_memory(struct oom_control *oc) if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current) && oom_cpuset_eligible(current, oc) && - current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + mem_cgroup_score_adj(current, current->signal->oom_score_adj) != + OOM_SCORE_ADJ_MIN) { get_task_struct(current); oc->chosen = current; oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); -- 1.8.3.1