The oom reaper ensures forward progress by setting MMF_OOM_SKIP itself if it cannot reap an mm. This can happen for a variety of reasons, including: - the inability to grab mm->mmap_sem in a sufficient amount of time, - when the mm has blockable mmu notifiers that could cause the oom reaper to stall indefinitely, but we can also add a third when the oom reaper can "reap" an mm but doing so is unlikely to free any amount of memory: - when the mm's memory is fully mlocked. When all memory is mlocked, the oom reaper will not be able to free any substantial amount of memory. It sets MMF_OOM_SKIP before the victim can unmap and free its memory in exit_mmap() and subsequent oom victims are chosen unnecessarily. This is trivial to reproduce if all eligible processes on the system have mlocked their memory: the oom killer calls panic() even though forward progress can be made. This is the same issue where the exit path sets MMF_OOM_SKIP before unmapping memory and additional processes can be chosen unnecessarily because the oom killer is racing with exit_mmap(). We can't simply defer setting MMF_OOM_SKIP, however, because if there is a true oom livelock in progress, it never gets set and no additional killing is possible. To fix this, this patch introduces a per-mm reaping timeout, initially set at 10s. It requires that the oom reaper's list becomes a properly linked list so that other mm's may be reaped while waiting for an mm's timeout to expire. The exit path will now set MMF_OOM_SKIP only after all memory has been freed, so additional oom killing is justified, and rely on MMF_UNSTABLE to determine when it can race with the oom reaper. The oom reaper will now set MMF_OOM_SKIP only after the reap timeout has lapsed because it can no longer guarantee forward progress. The reaping timeout is intentionally set for a substantial amount of time since oom livelock is a very rare occurrence and it's better to optimize for preventing additional (unnecessary) oom killing than a scenario that is much more unlikely. Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx> --- include/linux/mm_types.h | 4 ++ include/linux/sched.h | 2 +- mm/mmap.c | 12 +++--- mm/oom_kill.c | 85 ++++++++++++++++++++++++++-------------- 4 files changed, 66 insertions(+), 37 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -462,6 +462,10 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif +#ifdef CONFIG_MMU + /* When to give up on oom reaping this mm */ + unsigned long reap_timeout; +#endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif diff --git a/include/linux/sched.h b/include/linux/sched.h --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1151,7 +1151,7 @@ struct task_struct { #endif int pagefault_disabled; #ifdef CONFIG_MMU - struct task_struct *oom_reaper_list; + struct list_head oom_reap_list; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3059,11 +3059,10 @@ void exit_mmap(struct mm_struct *mm) if (unlikely(mm_is_oom_victim(mm))) { /* * Manually reap the mm to free as much memory as possible. - * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard - * this mm from further consideration. Taking mm->mmap_sem for - * write after setting MMF_OOM_SKIP will guarantee that the oom - * reaper will not run on this mm again after mmap_sem is - * dropped. + * Then, set MMF_UNSTABLE to avoid racing with the oom reaper. + * Taking mm->mmap_sem for write after setting MMF_UNSTABLE will + * guarantee that the oom reaper will not run on this mm again + * after mmap_sem is dropped. * * Nothing can be holding mm->mmap_sem here and the above call * to mmu_notifier_release(mm) ensures mmu notifier callbacks in @@ -3077,7 +3076,7 @@ void exit_mmap(struct mm_struct *mm) __oom_reap_task_mm(mm); mutex_unlock(&oom_lock); - set_bit(MMF_OOM_SKIP, &mm->flags); + set_bit(MMF_UNSTABLE, &mm->flags); down_write(&mm->mmap_sem); up_write(&mm->mmap_sem); } @@ -3105,6 +3104,7 @@ void exit_mmap(struct mm_struct *mm) unmap_vmas(&tlb, vma, 0, -1); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); + set_bit(MMF_OOM_SKIP, &mm->flags); /* * Walk the list again, actually closing and freeing it, diff --git a/mm/oom_kill.c b/mm/oom_kill.c --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -476,7 +476,7 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) */ static struct task_struct *oom_reaper_th; static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -static struct task_struct *oom_reaper_list; +static LIST_HEAD(oom_reaper_list); static DEFINE_SPINLOCK(oom_reaper_lock); void __oom_reap_task_mm(struct mm_struct *mm) @@ -558,12 +558,12 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't - * work on the mm anymore. The check for MMF_OOM_SKIP must run + * MMF_UNSTABLE is set by exit_mmap when the OOM reaper can't + * work on the mm anymore. The check for MMF_UNSTABLE must run * under mmap_sem for reading because it serializes against the * down_write();up_write() cycle in exit_mmap(). */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (test_bit(MMF_UNSTABLE, &mm->flags)) { up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); goto unlock_oom; @@ -589,31 +589,49 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) #define MAX_OOM_REAP_RETRIES 10 static void oom_reap_task(struct task_struct *tsk) { - int attempts = 0; struct mm_struct *mm = tsk->signal->oom_mm; + bool ret = true; - /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm)) - schedule_timeout_idle(HZ/10); + /* + * If this mm has either been fully unmapped, or the oom reaper has + * given up on it, nothing left to do except drop the refcount. + */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) + goto drop; - if (attempts <= MAX_OOM_REAP_RETRIES || - test_bit(MMF_OOM_SKIP, &mm->flags)) - goto done; + /* + * If this mm has already been reaped, doing so again will not likely + * free additional memory. + */ + if (!test_bit(MMF_UNSTABLE, &mm->flags)) + ret = oom_reap_task_mm(tsk, mm); + + if (time_after(jiffies, mm->reap_timeout)) { + if (!test_bit(MMF_OOM_SKIP, &mm->flags)) { + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + debug_show_all_locks(); - pr_info("oom_reaper: unable to reap pid:%d (%s)\n", - task_pid_nr(tsk), tsk->comm); - debug_show_all_locks(); + /* + * Reaping has failed for the timeout period, so give up + * and allow additional processes to be oom killed. + */ + set_bit(MMF_OOM_SKIP, &mm->flags); + } + goto drop; + } -done: - tsk->oom_reaper_list = NULL; + if (!ret) + schedule_timeout_idle(HZ/10); - /* - * Hide this mm from OOM killer because it has been either reaped or - * somebody can't call up_write(mmap_sem). - */ - set_bit(MMF_OOM_SKIP, &mm->flags); + /* Enqueue to be reaped again */ + spin_lock(&oom_reaper_lock); + list_add(&tsk->oom_reap_list, &oom_reaper_list); + spin_unlock(&oom_reaper_lock); + return; - /* Drop a reference taken by wake_oom_reaper */ +drop: + /* Drop the reference taken by wake_oom_reaper() */ put_task_struct(tsk); } @@ -622,11 +640,13 @@ static int oom_reaper(void *unused) while (true) { struct task_struct *tsk = NULL; - wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); + wait_event_freezable(oom_reaper_wait, + !list_empty(&oom_reaper_list)); spin_lock(&oom_reaper_lock); - if (oom_reaper_list != NULL) { - tsk = oom_reaper_list; - oom_reaper_list = tsk->oom_reaper_list; + if (!list_empty(&oom_reaper_list)) { + tsk = list_entry(&oom_reaper_list, struct task_struct, + oom_reap_list); + list_del(&tsk->oom_reap_list); } spin_unlock(&oom_reaper_lock); @@ -637,17 +657,22 @@ static int oom_reaper(void *unused) return 0; } +/* How long to wait to oom reap an mm before selecting another process */ +#define OOM_REAP_TIMEOUT_MSECS (10 * 1000) static void wake_oom_reaper(struct task_struct *tsk) { - /* tsk is already queued? */ - if (tsk == oom_reaper_list || tsk->oom_reaper_list) + /* + * Set the reap timeout; if it's already set, the mm is enqueued and + * this tsk can be ignored. + */ + if (cmpxchg(&tsk->signal->oom_mm->reap_timeout, 0UL, + jiffies + msecs_to_jiffies(OOM_REAP_TIMEOUT_MSECS))) return; get_task_struct(tsk); spin_lock(&oom_reaper_lock); - tsk->oom_reaper_list = oom_reaper_list; - oom_reaper_list = tsk; + list_add(&tsk->oom_reap_list, &oom_reaper_list); spin_unlock(&oom_reaper_lock); trace_wake_reaper(tsk->pid); wake_up(&oom_reaper_wait);