And here again. Get rid of the mm_users check because it is not reliable. --- From 7681e91cba6bcd45f9ebc5d2dcee3df06c687296 Mon Sep 17 00:00:00 2001 From: Michal Hocko <mhocko@xxxxxxxx> Date: Wed, 25 May 2016 19:50:34 +0200 Subject: [PATCH] mm, oom_adj: make sure processes sharing mm have same view of oom_score_adj oom_score_adj is shared for the thread groups (via struct signal) but this is not sufficient to cover processes sharing mm (CLONE_VM without CLONE_THREAD resp. CLONE_SIGHAND) and so we can easily end up in a situation when some processes update their oom_score_adj and confuse the oom killer. In the worst case some of those processes might hide from the oom killer altogether via OOM_SCORE_ADJ_MIN while others are eligible. The OOM killer would then pick up those eligible but won't be allowed to kill others sharing the same mm, so the mm would not be released and neither would its memory. It would be ideal to have the oom_score_adj per mm_struct because that is the natural entity the OOM killer considers. But this will not work because some programs are doing: vfork(), set_oom_adj(), exec(). We can achieve the same though. The oom_score_adj write handler can set the oom_score_adj for all processes sharing the same mm if the task is not in the middle of vfork. As a result all the processes will share the same oom_score_adj. The current implementation is rather pessimistic and checks all the existing processes by default if there is more than one holder of the mm, but we do not have any reliable way to check for external users yet. Note that we have to serialize all the oom_score_adj writers now to guarantee they do not interleave and generate inconsistent results. 
Signed-off-by: Michal Hocko <mhocko@xxxxxxxx> --- fs/proc/base.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 2 ++ mm/oom_kill.c | 2 +- 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 0afc77d4d84a..fa0b3ca94dfb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1043,10 +1043,13 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { + static DEFINE_MUTEX(oom_adj_mutex); + struct mm_struct *mm = NULL; struct task_struct *task; unsigned long flags; int err = 0; + mutex_lock(&oom_adj_mutex); task = get_proc_task(file_inode(file)); if (!task) { err = -ESRCH; @@ -1079,6 +1082,21 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) } } + /* + * Make sure we will check other processes sharing the mm if this is + * not vfrok which wants its own oom_score_adj. + * pin the mm so it doesn't go away and get reused. 
+ */ + if (!task->vfork_done) { + struct task_struct *p = find_lock_task_mm(task); + + if (p && atomic_read(&p->mm->mm_users) > 1) { + mm = p->mm; + atomic_inc(&mm->mm_count); + task_unlock(p); + } + } + task->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = (short)oom_adj; @@ -1087,7 +1105,25 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) unlock_task_sighand(task, &flags); err_put_task: put_task_struct(task); + + if (mm) { + struct task_struct *p; + + rcu_read_lock(); + for_each_process(p) { + task_lock(p); + if (!p->vfork_done && process_shares_mm(p, mm)) { + p->signal->oom_score_adj = oom_adj; + if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) + p->signal->oom_score_adj_min = (short)oom_adj; + } + task_unlock(p); + } + rcu_read_unlock(); + mmdrop(mm); + } out: + mutex_unlock(&oom_adj_mutex); return err; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 05102822912c..b44d3d792a00 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2248,6 +2248,8 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ +extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); + #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; int drop_caches_sysctl_handler(struct ctl_table *, int, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1685890d424e..268b76b88220 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -416,7 +416,7 @@ bool oom_killer_disabled __read_mostly; * task's threads: if one of those is using this mm then this task was also * using it. */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) { struct task_struct *t; -- 2.8.1 -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. 
For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@xxxxxxxxx"> email@xxxxxxxxx </a>