In the CONFIG_MMU=y case we already have a list of inflight OOM victim
threads, chained via oom_reaper_list. If we maintain the same list in the
CONFIG_MMU=n case as well, we can check whether there are inflight OOM
victims before starting the process/memcg list traversal. Since only a few
threads are expected to be chained at any time, checking the OOM domain of
every chained victim is cheap.

Thus, check whether there are inflight OOM victims before starting the
process/memcg list traversal. To make that check work, OOM victims have to
stay chained until MMF_OOM_SKIP is set on their mm. This patch therefore
changes the OOM reaper to wait for a request from the OOM killer, passed
via the new oom_reap_target variable, which also allows the OOM reaper to
preferentially reclaim from the mm that the OOM killer is currently
waiting on.

Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxxxx>
Cc: David Rientjes <rientjes@xxxxxxxxxx>
Cc: Roman Gushchin <guro@xxxxxx>
---
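Note for reviewers (not part of the commit message): below is a minimal,
standalone userspace sketch of the selection flow this patch introduces.
The names mirror the patch (oom_victim_list, MMF_OOM_SKIP), but the list,
locking and task/mm structures are simplified stand-ins for illustration,
not the kernel code; the real implementation is in the hunks that follow.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for a victim task whose mm may still be unreaped. */
struct victim {
	const char *name;
	bool mmf_oom_skip;	/* models MMF_OOM_SKIP on the victim's mm */
	struct victim *next;	/* models chaining onto oom_victim_list */
};

static struct victim *victim_list;

/* Models mark_oom_victim(): chain the new victim onto the list. */
static void mark_victim(struct victim *v)
{
	v->next = victim_list;
	victim_list = v;
}

/*
 * Models oom_has_pending_victims(): walk the (short) list and report
 * whether any victim has not reached MMF_OOM_SKIP yet. The real code
 * additionally filters by OOM domain via oom_unkillable_task() and,
 * on CONFIG_MMU, hands the first pending victim to the OOM reaper.
 */
static bool has_pending_victims(void)
{
	struct victim *v;

	for (v = victim_list; v; v = v->next)
		if (!v->mmf_oom_skip)
			return true;
	return false;
}

int main(void)
{
	struct victim a = { .name = "victim-a" };

	mark_victim(&a);
	/* A pending victim blocks selection of another one. */
	printf("%s pending: select new victim? %s\n", a.name,
	       has_pending_victims() ? "no, wait" : "yes");
	a.mmf_oom_skip = true;	/* the reaper or __mmput() finished */
	printf("%s reaped:  select new victim? %s\n", a.name,
	       has_pending_victims() ? "no, wait" : "yes");
	return 0;
}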
 include/linux/oom.h   |  1 +
 include/linux/sched.h |  4 +--
 kernel/fork.c         |  2 ++
 mm/oom_kill.c         | 97 +++++++++++++++++++++++++++++----------------------
 4 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 69864a5..4a147871 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -104,6 +104,7 @@ extern unsigned long oom_badness(struct task_struct *p,
 extern bool out_of_memory(struct oom_control *oc);
 
 extern void exit_oom_victim(void);
+extern void exit_oom_mm(struct mm_struct *mm);
 
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9e686dc..589fe78 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1173,9 +1173,7 @@ struct task_struct {
 	unsigned long			task_state_change;
 #endif
 	int				pagefault_disabled;
-#ifdef CONFIG_MMU
-	struct task_struct		*oom_reaper_list;
-#endif
+	struct list_head		oom_victim_list;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 276fdc6..ba1260d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1010,6 +1010,8 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	if (unlikely(mm_is_oom_victim(mm)))
+		exit_oom_mm(mm);
 	mmdrop(mm);
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dad0409..a743a8e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -321,18 +321,6 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto next;
 
 	/*
-	 * This task already has access to memory reserves and is being killed.
-	 * Don't allow any other task to have access to the reserves unless
-	 * the task has MMF_OOM_SKIP because chances that it would release
-	 * any memory is quite low.
-	 */
-	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
-			goto next;
-		goto abort;
-	}
-
-	/*
 	 * If task is allocating a lot of memory and has been marked to be
 	 * killed first if it triggers an oom, then select it.
 	 */
@@ -356,11 +344,6 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	oc->chosen_points = points;
 next:
 	return 0;
-abort:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
-	oc->chosen = (void *)-1UL;
-	return 1;
 }
 
 /*
@@ -478,6 +461,8 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 	return false;
 }
 
+static LIST_HEAD(oom_victim_list);
+
 #ifdef CONFIG_MMU
 /*
  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -485,7 +470,7 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
  */
 static struct task_struct *oom_reaper_th;
 static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-static struct task_struct *oom_reaper_list;
+static struct task_struct *oom_reap_target;
 
 bool __oom_reap_task_mm(struct mm_struct *mm)
 {
@@ -598,33 +583,21 @@ static void oom_reap_task(struct task_struct *tsk)
 		debug_show_all_locks();
 
 done:
-	tsk->oom_reaper_list = NULL;
-
 	/*
 	 * Hide this mm from OOM killer because it has been either reaped or
 	 * somebody can't call up_write(mmap_sem).
 	 */
	set_bit(MMF_OOM_SKIP, &mm->flags);
-
-	/* Drop a reference taken by mark_oom_victim(). */
-	put_task_struct(tsk);
 }
 
 static int oom_reaper(void *unused)
 {
 	while (true) {
-		struct task_struct *tsk = NULL;
-
-		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-		mutex_lock(&oom_lock);
-		if (oom_reaper_list != NULL) {
-			tsk = oom_reaper_list;
-			oom_reaper_list = tsk->oom_reaper_list;
-		}
-		mutex_unlock(&oom_lock);
-
-		if (tsk)
-			oom_reap_task(tsk);
+		wait_event_freezable(oom_reaper_wait, oom_reap_target != NULL);
+		oom_reap_task(oom_reap_target);
+		/* Drop a reference taken by oom_has_pending_victims(). */
+		put_task_struct(oom_reap_target);
+		oom_reap_target = NULL;
 	}
 
 	return 0;
@@ -661,13 +634,8 @@ static void mark_oom_victim(struct task_struct *tsk)
 	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
 		mmgrab(tsk->signal->oom_mm);
 		set_bit(MMF_OOM_VICTIM, &mm->flags);
-#ifdef CONFIG_MMU
 		get_task_struct(tsk);
-		tsk->oom_reaper_list = oom_reaper_list;
-		oom_reaper_list = tsk;
-		trace_wake_reaper(tsk->pid);
-		wake_up(&oom_reaper_wait);
-#endif
+		list_add(&tsk->oom_victim_list, &oom_victim_list);
 	}
 
 	/*
@@ -681,6 +649,21 @@ static void mark_oom_victim(struct task_struct *tsk)
 	trace_mark_victim(tsk->pid);
 }
 
+void exit_oom_mm(struct mm_struct *mm)
+{
+	struct task_struct *p, *tmp;
+
+	mutex_lock(&oom_lock);
+	list_for_each_entry_safe(p, tmp, &oom_victim_list, oom_victim_list) {
+		if (mm != p->signal->oom_mm)
+			continue;
+		list_del(&p->oom_victim_list);
+		/* Drop a reference taken by mark_oom_victim(). */
+		put_task_struct(p);
+	}
+	mutex_unlock(&oom_lock);
+}
+
 /**
  * exit_oom_victim - note the exit of an OOM victim
  */
@@ -1020,6 +1003,35 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+static bool oom_has_pending_victims(struct oom_control *oc)
+{
+	struct task_struct *p;
+
+	if (is_sysrq_oom(oc))
+		return false;
+	/*
+	 * Since oom_reap_task()/exit_mmap() will set MMF_OOM_SKIP, let's
+	 * wait for pending victims until MMF_OOM_SKIP is set or __mmput()
+	 * completes.
+	 */
+	list_for_each_entry(p, &oom_victim_list, oom_victim_list) {
+		if (oom_unkillable_task(p, oc->memcg, oc->nodemask))
+			continue;
+		if (!test_bit(MMF_OOM_SKIP, &p->signal->oom_mm->flags)) {
+#ifdef CONFIG_MMU
+			if (!oom_reap_target) {
+				get_task_struct(p);
+				oom_reap_target = p;
+				trace_wake_reaper(p->pid);
+				wake_up(&oom_reaper_wait);
+			}
+#endif
+			return true;
+		}
+	}
+	return false;
+}
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @oc: pointer to struct oom_control
@@ -1072,6 +1084,9 @@ bool out_of_memory(struct oom_control *oc)
 		oc->nodemask = NULL;
 	check_panic_on_oom(oc, constraint);
 
+	if (oom_has_pending_victims(oc))
+		return true;
+
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
 	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
-- 
1.8.3.1