dump_tasks() calls printk() on each userspace process under RCU which might result in RCU stall warning. I proposed introducing timeout for dump_tasks() operation, but Michal Hocko expects that dump_tasks() is either print all or suppress all [1]. Therefore, this patch changes to cache all candidates at oom_evaluate_task() and then print the cached candidates at dump_tasks(). With this patch, dump_tasks() no longer need to call printk() under RCU. Also, dump_tasks() no longer need to check oom_unkillable_task() by traversing all userspace processes, for select_bad_process() already traversed all possible candidates. Also, the OOM killer no longer need to call put_task_struct() from atomic context, and we can simplify refcount handling for __oom_kill_process(). This patch has a user-visible side effect that oom_kill_allocating_task path implies oom_dump_tasks == 0, for oom_evaluate_task() is not called via select_bad_process(). But since the purpose of enabling oom_kill_allocating_task is to kill as quick as possible (i.e. tolerate killing more than needed) whereas the purpose of dump_tasks() which might take minutes is to understand how the OOM killer selected an OOM victim, not printing candidates should be acceptable when the administrator asked the OOM killer to kill current thread. [1] https://lore.kernel.org/linux-mm/20180906115320.GS14951@xxxxxxxxxxxxxx/ Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> Cc: Dmitry Vyukov <dvyukov@xxxxxxxxxx> Cc: Petr Mladek <pmladek@xxxxxxxx> Cc: Sergey Senozhatsky <sergey.senozhatsky@xxxxxxxxx> Cc: Steven Rostedt <rostedt@xxxxxxxxxxx> --- include/linux/sched.h | 1 + mm/oom_kill.c | 44 +++++++++++++++++++------------------------- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1183741..f1736bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1180,6 +1180,7 @@ struct task_struct { #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; #endif + struct list_head oom_candidate_list; #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7534046..00b594c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -63,6 +63,7 @@ * and mark_oom_victim */ DEFINE_MUTEX(oom_lock); +static LIST_HEAD(oom_candidate_list); #ifdef CONFIG_NUMA /** @@ -333,6 +334,9 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto abort; } + get_task_struct(task); + list_add_tail(&task->oom_candidate_list, &oom_candidate_list); + /* * If task is allocating a lot of memory and has been marked to be * killed first if it triggers an oom, then select it. @@ -350,16 +354,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) if (points == oc->chosen_points && thread_group_leader(oc->chosen)) goto next; select: - if (oc->chosen) - put_task_struct(oc->chosen); - get_task_struct(task); oc->chosen = task; oc->chosen_points = points; next: return 0; abort: - if (oc->chosen) - put_task_struct(oc->chosen); oc->chosen = (void *)-1UL; return 1; } @@ -401,11 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) pr_info("Tasks state (memory values in pages):\n"); pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); - rcu_read_lock(); - for_each_process(p) { - if (oom_unkillable_task(p, memcg, nodemask)) - continue; - + list_for_each_entry(p, &oom_candidate_list, oom_candidate_list) { + cond_resched(); task = find_lock_task_mm(p); if (!task) { /* @@ -424,7 +420,6 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) task->signal->oom_score_adj, task->comm); task_unlock(task); } - rcu_read_unlock(); } static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) @@ -455,7 +450,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) if (is_dump_unreclaim_slabs()) dump_unreclaimable_slab(); } - if (sysctl_oom_dump_tasks) + if (sysctl_oom_dump_tasks && !list_empty(&oom_candidate_list)) dump_tasks(oc->memcg, oc->nodemask); if (p) dump_oom_summary(oc, p); @@ -849,17 +844,11 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) struct mm_struct *mm; bool can_oom_reap = true; - p = find_lock_task_mm(victim); - if (!p) { - put_task_struct(victim); - return; - } else if (victim != p) { - get_task_struct(p); - put_task_struct(victim); - victim = p; - } - /* Get a reference to safely compare mm after task_unlock(victim) */ + victim = find_lock_task_mm(victim); + if (!victim) + return; + get_task_struct(victim); mm = victim->mm; mmgrab(mm); @@ -931,7 +920,6 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message) { if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN && !is_global_init(task)) { - get_task_struct(task); __oom_kill_process(task, message); } return 0; @@ -954,7 +942,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message) mark_oom_victim(victim); wake_oom_reaper(victim); task_unlock(victim); - put_task_struct(victim); return; } task_unlock(victim); @@ -1077,7 +1064,6 @@ bool out_of_memory(struct oom_control *oc) if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { - get_task_struct(current); oc->chosen = current; oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); return true; @@ -1099,6 +1085,14 @@ bool out_of_memory(struct oom_control *oc) if (oc->chosen && oc->chosen != (void *)-1UL) oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory"); + while (!list_empty(&oom_candidate_list)) { + struct task_struct *p = list_first_entry(&oom_candidate_list, + struct task_struct, + oom_candidate_list); + + list_del(&p->oom_candidate_list); + put_task_struct(p); + } return !!oc->chosen; } -- 1.8.3.1