Michal Hocko wrote:
> > We need to teach the OOM reaper to stop reaping as soon as it enters exit_mmap().
> > Maybe let the OOM reaper poll for progress (e.g. none of get_mm_counter(mm, *)
> > decreased for the last 1 second)?
> 
> Can we start simple and build a more elaborate heuristic on top _please_?
> In other words, holding the mmap_sem for write for oom victims in
> exit_mmap should handle the problem. We can then enhance this to probe
> for progress or any other clever tricks if we find out that the race
> happens too often and we kill more than necessary.
> 
> Let's not repeat the error of trying to be too clever from the beginning
> as we did previously. This area is just too subtle and obviously error
> prone.

Something like this?
---
 mm/mmap.c     | 41 +++++++++++++++++++++++------------------
 mm/oom_kill.c | 29 +++++++++++------------------
 2 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 188f195..3edb7da 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3015,6 +3015,28 @@ void exit_mmap(struct mm_struct *mm)
         /* mm's last user has gone, and its about to be pulled down */
         mmu_notifier_release(mm);
 
+        if (unlikely(mm_is_oom_victim(mm))) {
+                /*
+                 * Tell oom_reap_task() not to start reaping this mm.
+                 *
+                 * oom_reap_task() depends on a stable VM_LOCKED flag to
+                 * indicate it should not unmap during munlock_vma_pages_all().
+                 *
+                 * Since MMF_UNSTABLE is set before calling down_write(),
+                 * oom_reap_task(), which calls down_read() before testing
+                 * MMF_UNSTABLE, will not run on this mm after up_write().
+                 *
+                 * mm_is_oom_victim() cannot become true from under us because
+                 * victim->mm is already set to NULL under task_lock() before
+                 * calling mmput(), and victim->signal->oom_mm is set by the
+                 * oom killer only if victim->mm is non-NULL while holding
+                 * task_lock().
+                 */
+                set_bit(MMF_UNSTABLE, &mm->flags);
+                down_write(&mm->mmap_sem);
+                up_write(&mm->mmap_sem);
+        }
+
         if (mm->locked_vm) {
                 vma = mm->mmap;
                 while (vma) {
@@ -3036,26 +3058,9 @@ void exit_mmap(struct mm_struct *mm)
         /* update_hiwater_rss(mm) here? but nobody should be looking */
         /* Use -1 here to ensure all VMAs in the mm are unmapped */
         unmap_vmas(&tlb, vma, 0, -1);
-
-        if (unlikely(mm_is_oom_victim(mm))) {
-                /*
-                 * Wait for oom_reap_task() to stop working on this
-                 * mm. Because MMF_OOM_SKIP is already set before
-                 * calling down_read(), oom_reap_task() will not run
-                 * on this "mm" post up_write().
-                 *
-                 * mm_is_oom_victim() cannot be set from under us
-                 * either because victim->mm is already set to NULL
-                 * under task_lock before calling mmput and oom_mm is
-                 * set not NULL by the OOM killer only if victim->mm
-                 * is found not NULL while holding the task_lock.
-                 */
-                set_bit(MMF_OOM_SKIP, &mm->flags);
-                down_write(&mm->mmap_sem);
-                up_write(&mm->mmap_sem);
-        }
         free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
         tlb_finish_mmu(&tlb, 0, -1);
+        set_bit(MMF_OOM_SKIP, &mm->flags);
 
         /*
          * Walk the list again, actually closing and freeing it,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ff992fa..1fef1b6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -510,25 +510,16 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
         /*
          * If the mm has invalidate_{start,end}() notifiers that could block,
+         * or if the mm is in exit_mmap(), which has unpredictable dependencies,
          * sleep to give the oom victim some more time.
          * TODO: we really want to get rid of this ugly hack and make sure that
          * notifiers cannot block for unbounded amount of time
          */
-        if (mm_has_blockable_invalidate_notifiers(mm)) {
-                up_read(&mm->mmap_sem);
-                schedule_timeout_idle(HZ);
-                goto unlock_oom;
-        }
-
-        /*
-         * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
-         * work on the mm anymore. The check for MMF_OOM_SKIP must run
-         * under mmap_sem for reading because it serializes against the
-         * down_write();up_write() cycle in exit_mmap().
-         */
-        if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+        if (mm_has_blockable_invalidate_notifiers(mm) ||
+            test_bit(MMF_UNSTABLE, &mm->flags)) {
                 up_read(&mm->mmap_sem);
                 trace_skip_task_reaping(tsk->pid);
+                schedule_timeout_idle(HZ);
                 goto unlock_oom;
         }
 
@@ -590,11 +581,9 @@ static void oom_reap_task(struct task_struct *tsk)
         while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
                 schedule_timeout_idle(HZ/10);
 
-        if (attempts <= MAX_OOM_REAP_RETRIES ||
-            test_bit(MMF_OOM_SKIP, &mm->flags))
+        if (test_bit(MMF_UNSTABLE, &mm->flags))
                 goto done;
-
         pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
                 task_pid_nr(tsk), tsk->comm);
         debug_show_all_locks();
 
@@ -603,8 +592,12 @@
         tsk->oom_reaper_list = NULL;
 
         /*
-         * Hide this mm from OOM killer because it has been either reaped or
-         * somebody can't call up_write(mmap_sem).
+         * Hide this mm from the OOM killer because:
+         *   the OOM reaper completed reaping,
+         *     or
+         *   exit_mmap() told the OOM reaper not to start reaping,
+         *     or
+         *   neither exit_mmap() nor the OOM reaper started reaping.
          */
         set_bit(MMF_OOM_SKIP, &mm->flags);
 
-- 
1.8.3.1
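
For anyone who wants to poke at the locking pattern outside the kernel tree: below is a
minimal userspace sketch of the handshake the patch relies on in exit_mmap(), namely
"set a stop flag, then cycle an empty write-lock section so every reader that missed the
flag has drained". It models mmap_sem with a pthread rwlock and MMF_UNSTABLE with an
atomic flag; all names are illustrative and none of this is kernel API. Build with
"cc -pthread sketch.c".

/* sketch.c -- userspace model of the MMF_UNSTABLE / mmap_sem handshake */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER; /* stands in for mmap_sem */
static atomic_bool stop_reaping;                               /* stands in for MMF_UNSTABLE */

/* Reaper side: mirrors __oom_reap_task_mm() taking mmap_sem for read. */
static void *reaper(void *arg)
{
        (void)arg;
        pthread_rwlock_rdlock(&map_lock);
        if (atomic_load(&stop_reaping)) {
                /* The exit path already claimed the address space: back off. */
                pthread_rwlock_unlock(&map_lock);
                return NULL;
        }
        /* ... unmapping work would happen here, still under the read lock ... */
        pthread_rwlock_unlock(&map_lock);
        return NULL;
}

/* Exit side: mirrors the new block at the top of exit_mmap(). */
static void exit_path(void)
{
        atomic_store(&stop_reaping, true);      /* set_bit(MMF_UNSTABLE, &mm->flags) */
        /*
         * Empty write-lock critical section: by the time the write lock is
         * acquired, any reaper that entered its read-side section before the
         * flag was set has left it, and any reaper that enters afterwards is
         * guaranteed to observe the flag and bail out.
         */
        pthread_rwlock_wrlock(&map_lock);       /* down_write(&mm->mmap_sem) */
        pthread_rwlock_unlock(&map_lock);       /* up_write(&mm->mmap_sem) */
        /* From here on the address space can be torn down exclusively. */
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, reaper, NULL);
        exit_path();
        pthread_join(t, NULL);
        puts("done");
        return 0;
}

The same reasoning is why the reaper's MMF_UNSTABLE test has to happen after down_read()
succeeds: checking the flag before taking the lock would reopen the window the empty
down_write();up_write() cycle is there to close.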