Hi Oleg, does this sound sane/correct to you? On Thu 01-09-16 11:51:03, Michal Hocko wrote: > From: Michal Hocko <mhocko@xxxxxxxx> > > mark_oom_victim and exit_oom_victim are used for oom_killer_disable > which should block as long as there are any oom victims alive. Up to now > we have relied on the TIF_MEMDIE task flag to count how many oom victims > we have. This is not optimal because only one thread receives this flag > at a time while the whole process (thread group) is killed and should > die. As a result we do not thaw the whole thread group and so a multi > threaded process can leave some threads behind in the fridge. We really > want to thaw all the threads. > > This is not all that easy because there is no reliable way to count > threads in the process as the oom killer might race with copy_process. > So marking all threads with TIF_MEMDIE and incrementing oom_victims > accordingly is not safe. Also the TIF_MEMDIE flag should just die so > we should better come up with a different approach. > > All we need to guarantee is that exit_oom_victim is called (only once > per process) at the time when no further access to (possibly suspended) > devices, and no generation of other IO (which would clobber the suspended > image), is possible. It seems we can rely on exit_notify for that because we > already have to detect the last thread to do a cleanup. Let's propagate > that information up to do_exit and only call exit_oom_victim for such > a thread. With this in place we can safely increment oom_victims only > once per thread group and thaw all the threads from the process. > freezing_slow_path can also rely on tsk_is_oom_victim as well now. > > exit_io_context is currently called after exit_notify but it seems it is > safe to call it right before exit_notify because that is past > exit_files. 
> > Cc: Oleg Nesterov <oleg@xxxxxxxxxx> > Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx> > Signed-off-by: Michal Hocko <mhocko@xxxxxxxx> > --- > include/linux/sched.h | 2 +- > kernel/exit.c | 38 ++++++++++++++++++++++++++++---------- > kernel/freezer.c | 3 ++- > mm/oom_kill.c | 29 +++++++++++++++++------------ > 4 files changed, 48 insertions(+), 24 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 770d01e7a68e..605e40b47992 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -2660,7 +2660,7 @@ static inline void kernel_signal_stop(void) > schedule(); > } > > -extern void release_task(struct task_struct * p); > +extern bool release_task(struct task_struct * p); > extern int send_sig_info(int, struct siginfo *, struct task_struct *); > extern int force_sigsegv(int, struct task_struct *); > extern int force_sig_info(int, struct siginfo *, struct task_struct *); > diff --git a/kernel/exit.c b/kernel/exit.c > index 914088e8c2ac..c762416dbed1 100644 > --- a/kernel/exit.c > +++ b/kernel/exit.c > @@ -165,10 +165,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp) > } > > > -void release_task(struct task_struct *p) > +bool release_task(struct task_struct *p) > { > struct task_struct *leader; > int zap_leader; > + bool last = false; > repeat: > /* don't need to get the RCU readlock here - the process is dead and > * can't be modifying its own credentials. But shut RCU-lockdep up */ > @@ -197,8 +198,10 @@ void release_task(struct task_struct *p) > * then we are the one who should release the leader. 
> */ > zap_leader = do_notify_parent(leader, leader->exit_signal); > - if (zap_leader) > + if (zap_leader) { > leader->exit_state = EXIT_DEAD; > + last = true; > + } > } > > write_unlock_irq(&tasklist_lock); > @@ -208,6 +211,8 @@ void release_task(struct task_struct *p) > p = leader; > if (unlikely(zap_leader)) > goto repeat; > + > + return last; > } > > /* > @@ -434,8 +439,6 @@ static void exit_mm(struct task_struct *tsk) > task_unlock(tsk); > mm_update_next_owner(mm); > mmput(mm); > - if (test_thread_flag(TIF_MEMDIE)) > - exit_oom_victim(); > } > > static struct task_struct *find_alive_thread(struct task_struct *p) > @@ -584,12 +587,15 @@ static void forget_original_parent(struct task_struct *father, > /* > * Send signals to all our closest relatives so that they know > * to properly mourn us.. > + * > + * Returns true if this is the last thread from the thread group > */ > -static void exit_notify(struct task_struct *tsk, int group_dead) > +static bool exit_notify(struct task_struct *tsk, int group_dead) > { > bool autoreap; > struct task_struct *p, *n; > LIST_HEAD(dead); > + bool last = false; > > write_lock_irq(&tasklist_lock); > forget_original_parent(tsk, &dead); > @@ -606,6 +612,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) > } else if (thread_group_leader(tsk)) { > autoreap = thread_group_empty(tsk) && > do_notify_parent(tsk, tsk->exit_signal); > + last = thread_group_empty(tsk); > } else { > autoreap = true; > } > @@ -621,8 +628,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead) > > list_for_each_entry_safe(p, n, &dead, ptrace_entry) { > list_del_init(&p->ptrace_entry); > - release_task(p); > + if (release_task(p) && p == tsk) > + last = true; > } > + > + return last; > } > > #ifdef CONFIG_DEBUG_STACK_USAGE > @@ -766,7 +776,18 @@ void do_exit(long code) > TASKS_RCU(preempt_disable()); > TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); > TASKS_RCU(preempt_enable()); > - exit_notify(tsk, 
group_dead); > + > + if (tsk->io_context) > + exit_io_context(tsk); > + > + /* > + * Notify oom_killer_disable that the last oom thread is exiting. > + * We might have more threads running at this point but none of them > + * will access any devices behind this point. > + */ > + if (exit_notify(tsk, group_dead) && tsk_is_oom_victim(current)) > + exit_oom_victim(); > + > proc_exit_connector(tsk); > mpol_put_task_policy(tsk); > #ifdef CONFIG_FUTEX > @@ -784,9 +805,6 @@ void do_exit(long code) > */ > tsk->flags |= PF_EXITPIDONE; > > - if (tsk->io_context) > - exit_io_context(tsk); > - > if (tsk->splice_pipe) > free_pipe_info(tsk->splice_pipe); > > diff --git a/kernel/freezer.c b/kernel/freezer.c > index 6f56a9e219fa..c6a64474a408 100644 > --- a/kernel/freezer.c > +++ b/kernel/freezer.c > @@ -10,6 +10,7 @@ > #include <linux/syscalls.h> > #include <linux/freezer.h> > #include <linux/kthread.h> > +#include <linux/oom.h> > > /* total number of freezing conditions in effect */ > atomic_t system_freezing_cnt = ATOMIC_INIT(0); > @@ -42,7 +43,7 @@ bool freezing_slow_path(struct task_struct *p) > if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) > return false; > > - if (test_tsk_thread_flag(p, TIF_MEMDIE)) > + if (tsk_is_oom_victim(p)) > return false; > > if (pm_nosig_freezing || cgroup_freezing(p)) > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > index e26529edcee3..5dec6321ac7b 100644 > --- a/mm/oom_kill.c > +++ b/mm/oom_kill.c > @@ -649,33 +649,38 @@ static inline void wake_oom_reaper(struct task_struct *tsk) > static void mark_oom_victim(struct task_struct *tsk) > { > struct mm_struct *mm = tsk->mm; > + struct task_struct *t; > > WARN_ON(oom_killer_disabled); > - /* OOM killer might race with memcg OOM */ > - if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) > - return; > > /* oom_mm is bound to the signal struct life time. 
*/ > - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) > + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { > atomic_inc(&tsk->signal->oom_mm->mm_count); > > + /* Only count thread groups */ > + atomic_inc(&oom_victims); > + } > + > /* > - * Make sure that the task is woken up from uninterruptible sleep > - * if it is frozen because OOM killer wouldn't be able to free > - * any memory and livelock. freezing_slow_path will tell the freezer > - * that TIF_MEMDIE tasks should be ignored. > + * Make sure that the whole thread group is woken up from > + * uninterruptible sleep if it is frozen because the oom victim > + * will free its memory completely only after exit. > + * freezing_slow_path will tell the freezer that oom victims > + * should be ignored. > */ > - __thaw_task(tsk); > - atomic_inc(&oom_victims); > + rcu_read_lock(); > + for_each_thread(tsk, t) > + __thaw_task(t); > + rcu_read_unlock(); > } > > /** > * exit_oom_victim - note the exit of an OOM victim > + * > + * Has to be called only once per thread group. > */ > void exit_oom_victim(void) > { > - clear_thread_flag(TIF_MEMDIE); > - > if (!atomic_dec_return(&oom_victims)) > wake_up_all(&oom_victims_wait); > } > -- > 2.8.1 > -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>