David Rientjes wrote:
> On Wed, 6 Dec 2017, Tetsuo Handa wrote:
> > Also, I don't know what exit_mmap() is doing but I think that there is a
> > possibility that the OOM reaper tries to reclaim mlocked pages as soon as
> > exit_mmap() cleared the VM_LOCKED flag by calling munlock_vma_pages_all().
> >
> >	if (mm->locked_vm) {
> >		vma = mm->mmap;
> >		while (vma) {
> >			if (vma->vm_flags & VM_LOCKED)
> >				munlock_vma_pages_all(vma);
> >			vma = vma->vm_next;
> >		}
> >	}
> >
>
> Yes, that looks possible as well, although the problem I have reported can
> happen with or without mlock. Did you find this by code inspection or
> have you experienced runtime problems with it?

By code inspection.

> I think this argues to do MMF_REAPING-style behavior at the beginning of
> exit_mmap() and avoid reaping altogether once we have reached that
> point. There are no more users of the mm and we are in the process of
> tearing it down; I'm not sure that the oom reaper should be in the
> business of trying to interfere with that. Or are there actual bug
> reports where an oom victim gets wedged while in exit_mmap() prior to
> releasing its memory?

If our assumption is that the OOM reaper can reclaim the majority of an
OOM victim's memory via the victim's ->signal->oom_mm, what would be wrong
with simply reverting commit 212925802454672e ("mm: oom: let oom_reap_task
and exit_mmap run concurrently") and replacing mmgrab()/mmdrop_async() with
mmget()/mmput_async(), so that the OOM reaper no longer needs to worry
about tricky __mmput() behavior (as shown below)?
----------
 kernel/fork.c | 17 ++++++++++++-----
 mm/mmap.c     | 17 -----------------
 mm/oom_kill.c | 21 +++++++--------------
 3 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 432eadf..018a857 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -394,12 +394,18 @@ static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
 	sched_autogroup_exit(sig);
-	/*
-	 * __mmdrop is not safe to call from softirq context on x86 due to
-	 * pgd_dtor so postpone it to the async context
-	 */
-	if (sig->oom_mm)
+	if (sig->oom_mm) {
+#ifdef CONFIG_MMU
+		mmput_async(sig->oom_mm);
+#else
+		/*
+		 * There might be architectures where calling __mmdrop() from
+		 * softirq context is not safe. Thus, postpone it to the async
+		 * context.
+		 */
 		mmdrop_async(sig->oom_mm);
+#endif
+	}
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -931,6 +937,7 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	set_bit(MMF_OOM_SKIP, &mm->flags);
 	mmdrop(mm);
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index a4d5468..fafaf06 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3019,23 +3019,6 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
 
-	set_bit(MMF_OOM_SKIP, &mm->flags);
-	if (unlikely(tsk_is_oom_victim(current))) {
-		/*
-		 * Wait for oom_reap_task() to stop working on this
-		 * mm. Because MMF_OOM_SKIP is already set before
-		 * calling down_read(), oom_reap_task() will not run
-		 * on this "mm" post up_write().
-		 *
-		 * tsk_is_oom_victim() cannot be set from under us
-		 * either because current->mm is already set to NULL
-		 * under task_lock before calling mmput and oom_mm is
-		 * set not NULL by the OOM killer only if current->mm
-		 * is found not NULL while holding the task_lock.
-		 */
-		down_write(&mm->mmap_sem);
-		up_write(&mm->mmap_sem);
-	}
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb, 0, -1);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c957be3..eb2a005 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -528,18 +528,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		goto unlock_oom;
 	}
 
-	/*
-	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
-	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
-	 * under mmap_sem for reading because it serializes against the
-	 * down_write();up_write() cycle in exit_mmap().
-	 */
-	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
-		up_read(&mm->mmap_sem);
-		trace_skip_task_reaping(tsk->pid);
-		goto unlock_oom;
-	}
-
 	trace_start_task_reaping(tsk->pid);
 
 	/*
@@ -683,8 +671,13 @@ static void mark_oom_victim(struct task_struct *tsk)
 		return;
 
 	/* oom_mm is bound to the signal struct life time. */
-	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
-		mmgrab(tsk->signal->oom_mm);
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+#ifdef CONFIG_MMU
+		mmget(mm);
+#else
+		mmgrab(mm);
+#endif
+	}
 
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
----------
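To make explicit why this removes the tricky part: with the change above,
mm_users acts as the gate for exit_mmap(), so the OOM reaper and
exit_mmap() can no longer run concurrently. A minimal annotated sketch of
the resulting lifetime rule (simplified from the patch above; the
interleaving comments are my own annotation, not part of the patch):

	/* OOM killer context: pin mm_users at victim selection time. */
	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
		mmget(mm);	/* the victim's final mmput() can no longer
				 * reach __mmput(), so exit_mmap() cannot
				 * start while the reaper may still run */

	/* Victim context: exit_mm() calls mmput(), which now merely
	 * decrements mm_users because of the pin above; __mmput() and
	 * thus exit_mmap() are skipped. */

	/* Teardown context: the pin is dropped only when the victim's
	 * signal_struct is freed, i.e. after the reaper is done with
	 * this mm: */
	if (sig->oom_mm)
		mmput_async(sig->oom_mm);	/* possibly the final mmput();
						 * __mmput() -> exit_mmap()
						 * runs only now, and then
						 * sets MMF_OOM_SKIP */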
If pinning the victim's address space while many victims' mm are waiting
for the OOM reaper to reclaim them is considered dangerous, I think we can
try direct OOM reaping (like the untested patch shown below), so that
victims are reclaimed on a first-blocked, first-reaped basis (which would
also serve as a mitigation for the race caused by removing oom_lock from
the OOM reaper).
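Since the patch spreads the idea across several hunks, here is the
intended control flow assembled in one place (an illustrative fragment
only, using the names from the patch below; the is_memcg_oom() handling
and error paths are omitted):

	/* oom_evaluate_task(), called under RCU by select_bad_process():
	 * when an existing victim whose mm is not yet MMF_OOM_SKIP is
	 * found, selection aborts and the allocating task reaps that mm
	 * directly, instead of queueing it to the oom_reaper kthread. */
	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
		mm = task->signal->oom_mm;
		if (test_bit(MMF_OOM_SKIP, &mm->flags))
			goto next;	/* reaped or given up; select anew */
		goto abort;
	}
	...
abort:
	oc->chosen = (void *)-1UL;
	get_task_struct(task);
	rcu_read_unlock();		/* oom_reap_task_mm() might block */
	oom_reap_task_mm(task, mm);	/* reap now, or set MMF_OOM_SKIP
					 * once attempts have failed for a
					 * whole second */
	put_task_struct(task);
	rcu_read_lock();
	return 1;			/* a victim exists: don't kill more */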
----------
 include/linux/mm_types.h |   3 ++
 include/linux/sched.h    |   3 --
 mm/oom_kill.c            | 132 ++++++++++++++---------------------------------
 3 files changed, 43 insertions(+), 95 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index cfd0ac4..068119b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -501,6 +501,9 @@ struct mm_struct {
 	atomic_long_t hugetlb_usage;
 #endif
 	struct work_struct async_put_work;
+#ifdef CONFIG_MMU
+	unsigned long oom_reap_started;
+#endif
 
 #if IS_ENABLED(CONFIG_HMM)
 	/* HMM needs to track a few things per mm */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a2709d2..d63b599 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1081,9 +1081,6 @@ struct task_struct {
 	unsigned long task_state_change;
 #endif
 	int pagefault_disabled;
-#ifdef CONFIG_MMU
-	struct task_struct *oom_reaper_list;
-#endif
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct *stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3b0d0fe..a9f8bae 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -309,9 +309,14 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
 	return CONSTRAINT_NONE;
 }
 
+#ifdef CONFIG_MMU
+static void oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm);
+#endif
+
 static int oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
+	struct mm_struct *mm;
 	unsigned long points;
 
 	if (oom_unkillable_task(task, NULL, oc->nodemask))
@@ -324,7 +329,8 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	 * any memory is quite low.
 	 */
 	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+		mm = task->signal->oom_mm;
+		if (test_bit(MMF_OOM_SKIP, &mm->flags))
 			goto next;
 		goto abort;
 	}
@@ -357,6 +363,15 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	if (oc->chosen)
 		put_task_struct(oc->chosen);
 	oc->chosen = (void *)-1UL;
+#ifdef CONFIG_MMU
+	get_task_struct(task);
+	if (!is_memcg_oom(oc))
+		rcu_read_unlock();
+	oom_reap_task_mm(task, mm);
+	put_task_struct(task);
+	if (!is_memcg_oom(oc))
+		rcu_read_lock();
+#endif
 	return 1;
 }
 
@@ -474,23 +489,14 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 	return false;
 }
 
-
 #ifdef CONFIG_MMU
-/*
- * OOM Reaper kernel thread which tries to reap the memory used by the OOM
- * victim (if that is possible) to help the OOM killer to move on.
- */
-static struct task_struct *oom_reaper_th;
-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-static struct task_struct *oom_reaper_list;
-static DEFINE_SPINLOCK(oom_reaper_lock);
-
 static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
 	bool ret = true;
 
+	trace_wake_reaper(tsk->pid);
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		ret = false;
 		trace_skip_task_reaping(tsk->pid);
@@ -507,8 +513,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
 	 */
 	if (mm_has_notifiers(mm)) {
+		ret = false;
 		up_read(&mm->mmap_sem);
-		schedule_timeout_idle(HZ);
 		goto unlock_oom;
 	}
 
@@ -567,82 +573,22 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	return ret;
 }
 
-#define MAX_OOM_REAP_RETRIES 10
-static void oom_reap_task(struct task_struct *tsk)
-{
-	int attempts = 0;
-	struct mm_struct *mm = tsk->signal->oom_mm;
-
-	/* Retry the down_read_trylock(mmap_sem) a few times */
-	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
-		schedule_timeout_idle(HZ/10);
-
-	if (attempts <= MAX_OOM_REAP_RETRIES)
-		goto done;
-
-
-	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
-		task_pid_nr(tsk), tsk->comm);
-	debug_show_all_locks();
-
-done:
-	tsk->oom_reaper_list = NULL;
-
-	/*
-	 * Hide this mm from OOM killer because it has been either reaped or
-	 * somebody can't call up_write(mmap_sem).
-	 */
-	set_bit(MMF_OOM_SKIP, &mm->flags);
-
-	/* Drop a reference taken by wake_oom_reaper */
-	put_task_struct(tsk);
-}
-
-static int oom_reaper(void *unused)
+static void oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
-	while (true) {
-		struct task_struct *tsk = NULL;
-
-		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-		spin_lock(&oom_reaper_lock);
-		if (oom_reaper_list != NULL) {
-			tsk = oom_reaper_list;
-			oom_reaper_list = tsk->oom_reaper_list;
+	if (__oom_reap_task_mm(tsk, mm))
+		/* Hide this mm from OOM killer because we reaped it. */
+		set_bit(MMF_OOM_SKIP, &mm->flags);
+	else if (!mm->oom_reap_started)
+		mm->oom_reap_started = jiffies;
+	else if (time_after(jiffies, mm->oom_reap_started + HZ)) {
+		if (!mm_has_notifiers(mm)) {
+			pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
				task_pid_nr(tsk), tsk->comm);
+			debug_show_all_locks();
 		}
-		spin_unlock(&oom_reaper_lock);
-
-		if (tsk)
-			oom_reap_task(tsk);
+		/* Hide this mm from OOM killer because we can't reap. */
+		set_bit(MMF_OOM_SKIP, &mm->flags);
 	}
-
-	return 0;
-}
-
-static void wake_oom_reaper(struct task_struct *tsk)
-{
-	/* tsk is already queued?
-	 */
-	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
-		return;
-
-	get_task_struct(tsk);
-
-	spin_lock(&oom_reaper_lock);
-	tsk->oom_reaper_list = oom_reaper_list;
-	oom_reaper_list = tsk;
-	spin_unlock(&oom_reaper_lock);
-	trace_wake_reaper(tsk->pid);
-	wake_up(&oom_reaper_wait);
-}
-
-static int __init oom_init(void)
-{
-	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
-	return 0;
-}
-subsys_initcall(oom_init)
-#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
-{
-}
 #endif /* CONFIG_MMU */
 
@@ -825,7 +771,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 					      DEFAULT_RATELIMIT_BURST);
-	bool can_oom_reap = true;
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -835,7 +780,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	task_lock(p);
 	if (task_will_free_mem(p)) {
 		mark_oom_victim(p);
-		wake_oom_reaper(p);
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -924,7 +868,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 		if (same_thread_group(p, victim))
 			continue;
 		if (is_global_init(p)) {
-			can_oom_reap = false;
 			set_bit(MMF_OOM_SKIP, &mm->flags);
 			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
					task_pid_nr(victim), victim->comm,
@@ -941,9 +884,15 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	}
 	rcu_read_unlock();
 
-	if (can_oom_reap)
-		wake_oom_reaper(victim);
-
+#ifdef CONFIG_MMU
+	/*
+	 * The sysctl_oom_kill_allocating_task case could get stuck because
+	 * select_bad_process(), which calls oom_reap_task_mm(), is not
+	 * called.
+	 */
+	if (sysctl_oom_kill_allocating_task &&
+	    !test_bit(MMF_OOM_SKIP, &mm->flags))
+		oom_reap_task_mm(victim, mm);
+#endif
 	mmdrop(mm);
 	put_task_struct(victim);
 }
@@ -1019,7 +968,6 @@ bool out_of_memory(struct oom_control *oc)
 	 */
 	if (task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		wake_oom_reaper(current);
 		return true;
 	}
----------