Michal Hocko wrote:
> > I'm trying to update select_bad_process() to use for_each_process() rather than
> > for_each_process_thread(). If we can do it, I think we can use is_oom_victim()
> > and respond to your concern
> > 3a5dda7a17cf ("oom: prevent unnecessary oom kills or kernel panics").
>
> I would really recommend to look through the history (git blame is your
> friend) before suggesting any changes in this area. The code is really
> subtle and many times for good reasons.

I think the code is too subtle to touch. I wish I could propose OOM killing
as simple as what is shown below.

 (1) Do not clear TIF_MEMDIE until TASK_DEAD. This should help cases where
     memory allocations are needed after exit_mm().

 (2) Do not wait for a TIF_MEMDIE task forever; use a simple timeout instead
     (a standalone sketch of this idea follows the list). This should help
     cases where the victim task is unable to make forward progress.

 (3) Do not grant ALLOC_NO_WATERMARKS to TIF_MEMDIE tasks. This should help
     avoid depletion of the memory reserves.

 (4) Allow any task with a pending SIGKILL or with PF_EXITING to access part
     of the memory reserves. This should help exiting tasks make forward
     progress.

 (5) Set TIF_MEMDIE on all threads chosen by the OOM killer. This should
     help reduce the mmap_sem problem.

 (6) After the timeout, select the next task to set TIF_MEMDIE on, rather
     than giving up the current TIF_MEMDIE task's memory allocations. This
     should help reduce the possibility of unexpected failures caused by
     e.g. btrfs's BUG_ON() traps.

 (7) Allow GFP_KILLABLE. This should help quicker termination.
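For reference, here is a minimal standalone sketch of the timeout idea in
(2) and (6), outside the kernel. All names in it (sketch_mark_victim(),
sketch_keep_waiting(), OOM_WAIT_SECONDS) are made up for illustration; only
the behaviour mirrors the 5 * HZ one-shot timer used in the diff below.

/*
 * Sketch only, not kernel code: marking a victim records a deadline; while
 * the deadline has not passed, the victim selection loop keeps waiting for
 * that victim; once it has passed, the loop simply moves on and selects the
 * next candidate instead of failing the current allocations.
 */
#include <stdbool.h>
#include <time.h>

#define OOM_WAIT_SECONDS 5      /* corresponds to the 5 * HZ timer below */

static time_t victim_deadline;  /* 0 means "no victim is being waited for" */

/* Called when a task is chosen as the victim: arm the per-victim timeout. */
static void sketch_mark_victim(void)
{
        victim_deadline = time(NULL) + OOM_WAIT_SECONDS;
}

/* Called by the selection loop when it encounters the marked victim. */
static bool sketch_keep_waiting(void)
{
        /* Still within the grace period: abort the scan and keep waiting. */
        if (victim_deadline && time(NULL) < victim_deadline)
                return true;
        /* Grace period is over: skip this victim and pick another one. */
        return false;
}

The full diff follows.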
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 45993b8..87a004e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -70,7 +70,7 @@ static inline bool oom_task_origin(const struct task_struct *p)
 	return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
 }
 
-extern void mark_oom_victim(struct task_struct *tsk);
+extern bool mark_oom_victim(struct task_struct *tsk);
 
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
diff --git a/kernel/exit.c b/kernel/exit.c
index fd90195..d4ff477 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -434,8 +434,6 @@ static void exit_mm(struct task_struct *tsk)
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
-	if (test_thread_flag(TIF_MEMDIE))
-		exit_oom_victim(tsk);
 }
 
 static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -825,6 +823,7 @@ void do_exit(long code)
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
 	tsk->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
+	exit_oom_victim(tsk);
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return". */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ae8b81c..5be5a85 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1258,10 +1258,9 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * select it. The goal is to allow it to allocate so that it may
 	 * quickly exit and free its memory.
 	 */
-	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
-		mark_oom_victim(current);
+	if ((fatal_signal_pending(current) || task_will_free_mem(current)) &&
+	    mark_oom_victim(current))
 		goto unlock;
-	}
 
 	check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
 	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d7bb9c1..6e7f5e4a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -50,6 +50,11 @@ int sysctl_oom_dump_tasks = 1;
 
 DEFINE_MUTEX(oom_lock);
 
+static void oomkiller_reset(unsigned long arg)
+{
+}
+static DEFINE_TIMER(oomkiller_victim_wait_timer, oomkiller_reset, 0, 0);
+
 #ifdef CONFIG_NUMA
 /**
  * has_intersects_mems_allowed() - check task eligiblity for kill
@@ -278,9 +283,10 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 	 * This task already has access to memory reserves and is being killed.
 	 * Don't allow any other task to have access to the reserves.
 	 */
-	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
+	if (test_tsk_thread_flag(task, TIF_MEMDIE) && task->state != TASK_DEAD) {
 		if (!is_sysrq_oom(oc))
-			return OOM_SCAN_ABORT;
+			return timer_pending(&oomkiller_victim_wait_timer) ?
+				OOM_SCAN_ABORT : OOM_SCAN_CONTINUE;
 	}
 	if (!task->mm)
 		return OOM_SCAN_CONTINUE;
@@ -292,9 +298,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 	if (oom_task_origin(task))
 		return OOM_SCAN_SELECT;
 
-	if (task_will_free_mem(task) && !is_sysrq_oom(oc))
-		return OOM_SCAN_ABORT;
-
 	return OOM_SCAN_OK;
 }
 
@@ -584,12 +587,12 @@ static void wake_oom_reaper(struct task_struct *mm)
  * Has to be called with oom_lock held and never after
  * oom has been disabled already.
  */
-void mark_oom_victim(struct task_struct *tsk)
+bool mark_oom_victim(struct task_struct *tsk)
 {
 	WARN_ON(oom_killer_disabled);
 	/* OOM killer might race with memcg OOM */
 	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
-		return;
+		return false;
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
 	 * if it is frozen because OOM killer wouldn't be able to free
@@ -598,6 +601,9 @@ void mark_oom_victim(struct task_struct *tsk)
 	 */
 	__thaw_task(tsk);
 	atomic_inc(&oom_victims);
+	/* Make sure that we won't wait for this task forever. */
+	mod_timer(&oomkiller_victim_wait_timer, jiffies + 5 * HZ);
+	return true;
 }
 
 /**
@@ -689,8 +695,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
 	 */
	task_lock(p);
-	if (p->mm && task_will_free_mem(p)) {
-		mark_oom_victim(p);
+	if (task_will_free_mem(p) && mark_oom_victim(p)) {
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -759,20 +764,16 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	task_unlock(victim);
 
 	/*
-	 * Kill all user processes sharing victim->mm in other thread groups, if
-	 * any. They don't get access to memory reserves, though, to avoid
-	 * depletion of all memory. This prevents mm->mmap_sem livelock when an
-	 * oom killed thread cannot exit because it requires the semaphore and
-	 * its contended by another thread trying to allocate memory itself.
-	 * That thread will now get access to memory reserves since it has a
-	 * pending fatal signal.
+	 * Kill all user processes sharing victim->mm. We give them access to
+	 * memory reserves because they are expected to terminate immediately.
+	 * Even if memory reserves were depleted, the OOM reaper takes care of
+	 * allowing the OOM killer to choose next victim by marking current
+	 * victim as OOM-unkillable and clearing TIF_MEMDIE.
 	 */
 	rcu_read_lock();
 	for_each_process(p) {
 		if (!process_shares_mm(p, mm))
 			continue;
-		if (same_thread_group(p, victim))
-			continue;
 		if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
 		    p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
 			/*
@@ -784,6 +785,12 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 			continue;
 		}
 		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+		for_each_thread(p, t) {
+			task_lock(t);
+			if (t->mm)
+				mark_oom_victim(t);
+			task_unlock(t);
+		}
 	}
 	rcu_read_unlock();
 
@@ -863,15 +870,10 @@ bool out_of_memory(struct oom_control *oc)
 	 * If current has a pending SIGKILL or is exiting, then automatically
 	 * select it. The goal is to allow it to allocate so that it may
 	 * quickly exit and free its memory.
-	 *
-	 * But don't select if current has already released its mm and cleared
-	 * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
 	 */
-	if (current->mm &&
-	    (fatal_signal_pending(current) || task_will_free_mem(current))) {
-		mark_oom_victim(current);
+	if ((fatal_signal_pending(current) || task_will_free_mem(current)) &&
+	    mark_oom_victim(current))
 		return true;
-	}
 
 	/*
 	 * Check if there were limitations on the allocation (only relevant for
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 85e7588..8c9e707 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3065,10 +3065,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 		else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
-		else if (!in_interrupt() &&
-			 ((current->flags & PF_MEMALLOC) ||
-			  unlikely(test_thread_flag(TIF_MEMDIE))))
+		else if (!in_interrupt() && (current->flags & PF_MEMALLOC))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
+		else if (!in_interrupt() && (fatal_signal_pending(current) ||
+					     (current->flags & PF_EXITING)))
+			alloc_flags |= ALLOC_HARDER;
 	}
 #ifdef CONFIG_CMA
 	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
@@ -3291,10 +3292,6 @@ retry:
 			goto nopage;
 	}
 
-	/* Avoid allocations with no watermarks from looping endlessly */
-	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
-		goto nopage;
-
 	/*
 	 * Try direct compaction. The first pass is asynchronous. Subsequent
 	 * attempts after direct reclaim are synchronous
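For completeness, the reserve policy in (3) and (4) can also be shown as a
standalone sketch. This is not the real gfp_to_alloc_flags(): the flag
values, the struct and sketch_reserve_flags() are made up, and the
interrupt-context checks are left out; only the decision order mirrors the
page_alloc.c change above.

/*
 * Sketch only, not kernel code: a task that merely wears TIF_MEMDIE no
 * longer gets full access to the reserves; a pending SIGKILL or PF_EXITING
 * grants only the smaller ALLOC_HARDER share.
 */
#include <stdbool.h>

#define SKETCH_ALLOC_HARDER        0x1  /* partial access to reserves */
#define SKETCH_ALLOC_NO_WATERMARKS 0x2  /* full access to reserves */

struct sketch_task {
        bool memalloc;        /* PF_MEMALLOC: already inside reclaim */
        bool sigkill_pending; /* fatal_signal_pending() */
        bool exiting;         /* PF_EXITING */
        bool memdie;          /* TIF_MEMDIE: chosen by the OOM killer */
};

static int sketch_reserve_flags(const struct sketch_task *t)
{
        /* Reclaim itself still gets unrestricted access. */
        if (t->memalloc)
                return SKETCH_ALLOC_NO_WATERMARKS;
        /*
         * Killed or exiting tasks get only part of the reserves; note that
         * t->memdie alone no longer grants anything extra (point (3)).
         */
        if (t->sigkill_pending || t->exiting)
                return SKETCH_ALLOC_HARDER;
        return 0;
}

The point is that an OOM victim can no longer drain the reserves completely,
while anything that is already dying still gets enough headroom to finish
exiting.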