Commit e2fe14564d3316d1 ("oom_reaper: close race with exiting task") guarded whole OOM reaping operations using oom_lock. But there was no need to guard whole operations. We needed to guard only setting of MMF_OOM_REAPED flag because get_page_from_freelist() in __alloc_pages_may_oom() is called with oom_lock held. If we change to guard only setting of MMF_OOM_SKIP flag, the OOM reaper can start reaping operations as soon as wake_oom_reaper() is called. But since setting of MMF_OOM_SKIP flag at __mmput() is not guarded with oom_lock, guarding only the OOM reaper side is not sufficient. If we change the OOM killer side to ignore MMF_OOM_SKIP flag once, there is no need to guard setting of MMF_OOM_SKIP flag, and we can guarantee a chance to call get_page_from_freelist() in __alloc_pages_may_oom() without depending on oom_lock serialization. This patch makes MMF_OOM_SKIP act as if MMF_OOM_REAPED, and adds a new flag which acts as if MMF_OOM_SKIP, in order to close both race window (the OOM reaper side and __mmput() side) without using oom_lock. Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> --- include/linux/mm_types.h | 1 + mm/oom_kill.c | 42 +++++++++++++++--------------------------- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ff15181..3184b7a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -495,6 +495,7 @@ struct mm_struct { */ bool tlb_flush_pending; #endif + bool oom_killer_synchronized; struct uprobes_state uprobes_state; #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9e8b4f0..1710133 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -300,11 +300,17 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves unless * the task has MMF_OOM_SKIP because chances that it would release - * any memory is quite low. + * any memory is quite low. But ignore MMF_OOM_SKIP once, for there is + * still possibility that get_page_from_freelist() with oom_lock held + * succeeds because MMF_OOM_SKIP is set without oom_lock held. */ if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { - if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) + struct mm_struct *mm = task->signal->oom_mm; + + if (mm->oom_killer_synchronized) goto next; + if (test_bit(MMF_OOM_SKIP, &mm->flags)) + mm->oom_killer_synchronized = true; goto abort; } @@ -470,28 +476,10 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - bool ret = true; - - /* - * We have to make sure to not race with the victim exit path - * and cause premature new oom victim selection: - * __oom_reap_task_mm exit_mm - * mmget_not_zero - * mmput - * atomic_dec_and_test - * exit_oom_victim - * [...] - * out_of_memory - * select_bad_process - * # no TIF_MEMDIE task selects new victim - * unmap_page_range # frees some memory - */ - mutex_lock(&oom_lock); if (!down_read_trylock(&mm->mmap_sem)) { - ret = false; trace_skip_task_reaping(tsk->pid); - goto unlock_oom; + return false; } /* @@ -502,7 +490,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) if (!mmget_not_zero(mm)) { up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); - goto unlock_oom; + return true; } trace_start_task_reaping(tsk->pid); @@ -549,9 +537,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) */ mmput_async(mm); trace_finish_task_reaping(tsk->pid); -unlock_oom: - mutex_unlock(&oom_lock); - return ret; + return true; } #define MAX_OOM_REAP_RETRIES 10 @@ -661,8 +647,10 @@ static void mark_oom_victim(struct task_struct *tsk) return; /* oom_mm is bound to the signal struct life time. */ - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) - mmgrab(tsk->signal->oom_mm); + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { + mmgrab(mm); + mm->oom_killer_synchronized = false; + } /* * Make sure that the task is woken up from uninterruptible sleep -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>