Roman, will you check this cleanup patch? This patch applies on top of next-20180724. I assumed that your series do not kill processes which current thread should not wait for termination. >From 86ba99fbf73a9eda0df5ee4ae70c075781e83f81 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> Date: Tue, 24 Jul 2018 14:00:45 +0900 Subject: [PATCH] mm,oom: Check pending victims earlier in out_of_memory(). The "mm, oom: cgroup-aware OOM killer" patchset introduced INFLIGHT_VICTIM in order to replace open-coded ((void *)-1UL). But (regarding CONFIG_MMU=y case) we have a list of inflight OOM victim threads which are connected to oom_reaper_list. Thus we can check whether there are inflight OOM victims before starting process/memcg list traversal. Since it is likely that only few threads are linked to oom_reaper_list, checking all victims' OOM domain will not matter. Thus, check whether there are inflight OOM victims before starting process/memcg list traversal and eliminate the "abort" path. Note that this patch could temporarily regress CONFIG_MMU=n kernels because this patch selects same victims rather than waits for victims if CONFIG_MMU=n. But if it makes difference, we had better revert commit 212925802454672e ("mm: oom: let oom_reap_task and exit_mmap run concurrently") because that commit removed setting of MMF_OOM_SKIP from __mmput() based on an assumption that OOM lockup on CONFIG_MMU=n kernels won't happen (in other words, OOM victims shall terminate immediately). This patch also mitigates OOM lockup caused by schedule_timeout_killable(1) with oom_lock held. Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> Cc: Roman Gushchin <guro@xxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Johannes Weiner <hannes@xxxxxxxxxxx> Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx> Cc: David Rientjes <rientjes@xxxxxxxxxx> Cc: Tejun Heo <tj@xxxxxxxxxx> --- include/linux/memcontrol.h | 9 ++- include/linux/oom.h | 9 +-- include/linux/sched.h | 2 +- mm/memcontrol.c | 61 ++++---------------- mm/oom_kill.c | 138 ++++++++++++++++++++++----------------------- 5 files changed, 84 insertions(+), 135 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 088ae04..3c202c8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -420,8 +420,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); -int mem_cgroup_scan_tasks(struct mem_cgroup *, - int (*)(struct task_struct *, void *), void *); +void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + void (*fn)(struct task_struct *, void *), void *arg); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { @@ -926,10 +926,9 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root, { } -static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, - int (*fn)(struct task_struct *, void *), void *arg) +static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + void (*fn)(struct task_struct *, void *), void *arg) { - return 0; } static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) diff --git a/include/linux/oom.h b/include/linux/oom.h index a16a155..6eb72b9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -10,13 +10,6 @@ #include <linux/sched/coredump.h> /* MMF_* */ #include <linux/mm.h> /* VM_FAULT* */ - -/* - * Special value returned by victim selection functions to indicate - * that are inflight OOM victims. - */ -#define INFLIGHT_VICTIM ((void *)-1UL) - struct zonelist; struct notifier_block; struct mem_cgroup; @@ -131,7 +124,7 @@ extern unsigned long oom_badness(struct task_struct *p, extern struct task_struct *find_lock_task_mm(struct task_struct *p); -extern int oom_evaluate_task(struct task_struct *task, void *arg); +extern void oom_evaluate_task(struct task_struct *task, void *arg); /* sysctls */ extern int sysctl_oom_dump_tasks; diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c66d03..a2d4719 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1175,7 +1175,7 @@ struct task_struct { #endif int pagefault_disabled; #ifdef CONFIG_MMU - struct task_struct *oom_reaper_list; + struct list_head oom_victim_list; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4603ad7..2b33456 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1058,33 +1058,29 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * @arg: argument passed to @fn * * This function iterates over tasks attached to @memcg or to any of its - * descendants and calls @fn for each task. If @fn returns a non-zero - * value, the function breaks the iteration loop and returns the value. - * Otherwise, it will iterate over all tasks and return 0. + * descendants and calls @fn for each task. * * If memcg is the root memory cgroup, this function will iterate only * over tasks belonging directly to the root memory cgroup. */ -int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, - int (*fn)(struct task_struct *, void *), void *arg) +void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + void (*fn)(struct task_struct *, void *), void *arg) { struct mem_cgroup *iter; - int ret = 0; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; struct task_struct *task; css_task_iter_start(&iter->css, 0, &it); - while (!ret && (task = css_task_iter_next(&it))) - ret = fn(task, arg); + while (!(task = css_task_iter_next(&it))) + fn(task, arg); css_task_iter_end(&it); - if (ret || memcg == root_mem_cgroup) { + if (memcg == root_mem_cgroup) { mem_cgroup_iter_break(memcg, iter); break; } } - return ret; } /** @@ -2854,7 +2850,6 @@ static long memcg_oom_badness(struct mem_cgroup *memcg, /* * Checks if the given memcg is a valid OOM victim and returns a number, * which means the folowing: - * -1: there are inflight OOM victim tasks, belonging to the memcg * 0: memcg is not eligible, e.g. all belonging tasks are protected * by oom_score_adj set to OOM_SCORE_ADJ_MIN * >0: memcg is eligible, and the returned value is an estimation @@ -2874,9 +2869,6 @@ static long oom_evaluate_memcg(struct mem_cgroup *memcg, * leaf memory cgroups, so we approximate it's oom_score * by summing oom_score of all belonging tasks, which are * owners of their mm structs. - * - * If there are inflight OOM victim tasks inside - * the root memcg, we return -1. */ if (memcg == root_mem_cgroup) { struct css_task_iter it; @@ -2884,24 +2876,9 @@ static long oom_evaluate_memcg(struct mem_cgroup *memcg, long score = 0; css_task_iter_start(&memcg->css, 0, &it); - while ((task = css_task_iter_next(&it))) { - if (tsk_is_oom_victim(task) && - !test_bit(MMF_OOM_SKIP, - &task->signal->oom_mm->flags)) { - score = -1; - break; - } - - task_lock(task); - if (!task->mm) { - task_unlock(task); - continue; - } - task_unlock(task); - + while ((task = css_task_iter_next(&it))) score += oom_badness(task, memcg, nodemask, totalpages); - } css_task_iter_end(&it); return score; @@ -2912,25 +2889,17 @@ static long oom_evaluate_memcg(struct mem_cgroup *memcg, * * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN * as unkillable. - * - * If there are inflight OOM victim tasks inside the memcg, - * we return -1. */ css_task_iter_start(&memcg->css, 0, &it); while ((task = css_task_iter_next(&it))) { - if (!eligible && - task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { eligible = 1; - - if (tsk_is_oom_victim(task) && - !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) { - eligible = -1; break; } } css_task_iter_end(&it); - if (eligible <= 0) + if (eligible == 0) return eligible; return memcg_oom_badness(memcg, nodemask, totalpages); @@ -2994,16 +2963,6 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc) if (score == 0) continue; - /* - * If there are inflight OOM victims, we don't need - * to look further for new victims. - */ - if (score == -1) { - oc->chosen_memcg = INFLIGHT_VICTIM; - mem_cgroup_iter_break(root, iter); - break; - } - group_score += score; if (group_score > oc->chosen_points) { @@ -3012,7 +2971,7 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc) } } - if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM) + if (oc->chosen_memcg) css_get(&oc->chosen_memcg->css); rcu_read_unlock(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 104ef4a..b3e6157 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -312,25 +312,13 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) return CONSTRAINT_NONE; } -int oom_evaluate_task(struct task_struct *task, void *arg) +void oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; unsigned long points; if (oom_unkillable_task(task, NULL, oc->nodemask)) - goto next; - - /* - * This task already has access to memory reserves and is being killed. - * Don't allow any other task to have access to the reserves unless - * the task has MMF_OOM_SKIP because chances that it would release - * any memory is quite low. - */ - if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { - if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) - goto next; - goto abort; - } + return; /* * If task is allocating a lot of memory and has been marked to be @@ -343,29 +331,22 @@ int oom_evaluate_task(struct task_struct *task, void *arg) points = oom_badness(task, NULL, oc->nodemask, oc->totalpages); if (!points || points < oc->chosen_points) - goto next; + return; /* Prefer thread group leaders for display purposes */ if (points == oc->chosen_points && thread_group_leader(oc->chosen_task)) - goto next; + return; select: if (oc->chosen_task) put_task_struct(oc->chosen_task); get_task_struct(task); oc->chosen_task = task; oc->chosen_points = points; -next: - return 0; -abort: - if (oc->chosen_task) - put_task_struct(oc->chosen_task); - oc->chosen_task = INFLIGHT_VICTIM; - return 1; } /* * Simple selection loop. We choose the process with the highest number of - * 'points'. In case scan was aborted, oc->chosen_task is set to -1. + * 'points'. */ static void select_bad_process(struct oom_control *oc) { @@ -376,8 +357,7 @@ static void select_bad_process(struct oom_control *oc) rcu_read_lock(); for_each_process(p) - if (oom_evaluate_task(p, oc)) - break; + oom_evaluate_task(p, oc); rcu_read_unlock(); } @@ -492,7 +472,7 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) */ static struct task_struct *oom_reaper_th; static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -static struct task_struct *oom_reaper_list; +static LIST_HEAD(oom_victim_list); static DEFINE_SPINLOCK(oom_reaper_lock); bool __oom_reap_task_mm(struct mm_struct *mm) @@ -598,14 +578,16 @@ static void oom_reap_task(struct task_struct *tsk) debug_show_all_locks(); done: - tsk->oom_reaper_list = NULL; - /* * Hide this mm from OOM killer because it has been either reaped or * somebody can't call up_write(mmap_sem). */ set_bit(MMF_OOM_SKIP, &mm->flags); + spin_lock(&oom_reaper_lock); + list_del(&tsk->oom_victim_list); + spin_unlock(&oom_reaper_lock); + /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); } @@ -615,12 +597,13 @@ static int oom_reaper(void *unused) while (true) { struct task_struct *tsk = NULL; - wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); + wait_event_freezable(oom_reaper_wait, + !list_empty(&oom_victim_list)); spin_lock(&oom_reaper_lock); - if (oom_reaper_list != NULL) { - tsk = oom_reaper_list; - oom_reaper_list = tsk->oom_reaper_list; - } + if (!list_empty(&oom_victim_list)) + tsk = list_first_entry(&oom_victim_list, + struct task_struct, + oom_victim_list); spin_unlock(&oom_reaper_lock); if (tsk) @@ -633,14 +616,13 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { /* tsk is already queued? */ - if (tsk == oom_reaper_list || tsk->oom_reaper_list) + if (tsk->oom_victim_list.next) return; get_task_struct(tsk); spin_lock(&oom_reaper_lock); - tsk->oom_reaper_list = oom_reaper_list; - oom_reaper_list = tsk; + list_add_tail(&tsk->oom_victim_list, &oom_victim_list); spin_unlock(&oom_reaper_lock); trace_wake_reaper(tsk->pid); wake_up(&oom_reaper_wait); @@ -981,17 +963,18 @@ static void oom_kill_process(struct oom_control *oc, const char *message) __oom_kill_process(victim); } -static int oom_kill_memcg_member(struct task_struct *task, void *unused) +static void oom_kill_memcg_member(struct task_struct *task, void *unused) { get_task_struct(task); __oom_kill_process(task); - return 0; } static bool oom_kill_memcg_victim(struct oom_control *oc) { - if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM) - return oc->chosen_memcg; + bool ret = true; + + if (oc->chosen_memcg == NULL) + return false; /* * If memory.oom_group is set, kill all tasks belonging to the sub-tree @@ -1001,23 +984,18 @@ static bool oom_kill_memcg_victim(struct oom_control *oc) if (mem_cgroup_oom_group(oc->chosen_memcg)) { mem_cgroup_scan_tasks(oc->chosen_memcg, oom_kill_memcg_member, NULL); - /* We have one or more terminating processes at this point. */ - oc->chosen_task = INFLIGHT_VICTIM; } else { oc->chosen_points = 0; oc->chosen_task = NULL; mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc); - if (oc->chosen_task == NULL || - oc->chosen_task == INFLIGHT_VICTIM) - goto out; - - __oom_kill_process(oc->chosen_task); + if (!oc->chosen_task) + ret = false; + else + __oom_kill_process(oc->chosen_task); } - -out: mem_cgroup_put(oc->chosen_memcg); - return oc->chosen_task; + return ret; } /* @@ -1058,6 +1036,34 @@ int unregister_oom_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_oom_notifier); +static bool oom_has_pending_victims(struct oom_control *oc) +{ +#ifdef CONFIG_MMU + struct task_struct *p; + + if (is_sysrq_oom(oc)) + return false; + /* + * Since oom_reap_task_mm()/exit_mmap() will set MMF_OOM_SKIP, let's + * wait for pending victims until MMF_OOM_SKIP is set. + */ + spin_lock(&oom_reaper_lock); + list_for_each_entry(p, &oom_victim_list, oom_victim_list) + if (!oom_unkillable_task(p, oc->memcg, oc->nodemask) && + !test_bit(MMF_OOM_SKIP, &p->signal->oom_mm->flags)) + break; + spin_unlock(&oom_reaper_lock); + return p != NULL; +#else + /* + * Since nobody except __oom_kill_process() sets MMF_OOM_SKIP, waiting + * for pending victims until MMF_OOM_SKIP is set is useless. Therefore, + * simply let the OOM killer select pending victims again. + */ + return false; +#endif +} + /** * out_of_memory - kill the "best" process when we run out of memory * @oc: pointer to struct oom_control @@ -1070,7 +1076,6 @@ int unregister_oom_notifier(struct notifier_block *nb) bool out_of_memory(struct oom_control *oc) { unsigned long freed = 0; - bool delay = false; /* if set, delay next allocation attempt */ oc->constraint = CONSTRAINT_NONE; if (oom_killer_disabled) @@ -1112,6 +1117,9 @@ bool out_of_memory(struct oom_control *oc) oc->nodemask = NULL; check_panic_on_oom(oc); + if (oom_has_pending_victims(oc)) + return true; + if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { @@ -1121,30 +1129,20 @@ bool out_of_memory(struct oom_control *oc) return true; } - if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) { - delay = true; - goto out; - } + if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) + return true; select_bad_process(oc); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { + if (!oc->chosen_task) { + if (is_sysrq_oom(oc) || is_memcg_oom(oc)) + return false; dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) - oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : - "Memory cgroup out of memory"); - -out: - /* - * Give the killed process a good chance to exit before trying - * to allocate memory again. - */ - if (delay) - schedule_timeout_killable(1); - - return !!oc->chosen_task; + oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : + "Memory cgroup out of memory"); + return true; } /* -- 1.8.3.1