Michal Hocko wrote:
> > We need to teach the OOM reaper to stop reaping as soon as it enters exit_mmap().
> > Maybe let the OOM reaper poll for progress (e.g. none of get_mm_counter(mm, *)
> > decreased for the last 1 second)?
> 
> Can we start simple and build a more elaborate heuristic on top _please_?
> In other words, holding the mmap_sem for write for oom victims in
> exit_mmap should handle the problem. We can then enhance this to probe
> for progress or any other clever tricks if we find out that the race
> happens too often and we kill more than necessary.
> 
> Let's not repeat the error of trying to be too clever from the beginning
> as we did previously. This area is just too subtle and obviously error
> prone.

Something like this?
---
 mm/mmap.c     | 41 +++++++++++++++++++++++------------------
 mm/oom_kill.c | 29 +++++++++++------------------
 2 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 188f195..3edb7da 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3015,6 +3015,28 @@ void exit_mmap(struct mm_struct *mm)
         /* mm's last user has gone, and its about to be pulled down */
         mmu_notifier_release(mm);
 
+        if (unlikely(mm_is_oom_victim(mm))) {
+                /*
+                 * Tell oom_reap_task() not to start reaping this mm.
+                 *
+                 * oom_reap_task() depends on a stable VM_LOCKED flag to
+                 * indicate it should not unmap during munlock_vma_pages_all().
+                 *
+                 * Since MMF_UNSTABLE is set before calling down_write(),
+                 * oom_reap_task(), which calls down_read() before testing
+                 * MMF_UNSTABLE, will not run on this mm after up_write().
+                 *
+                 * mm_is_oom_victim() cannot become true from under us because
+                 * victim->mm is already set to NULL under task_lock() before
+                 * calling mmput(), and victim->signal->oom_mm is set by the
+                 * oom killer only if victim->mm is non-NULL while holding
+                 * task_lock().
+                 */
+                set_bit(MMF_UNSTABLE, &mm->flags);
+                down_write(&mm->mmap_sem);
+                up_write(&mm->mmap_sem);
+        }
+
         if (mm->locked_vm) {
                 vma = mm->mmap;
                 while (vma) {
@@ -3036,26 +3058,9 @@ void exit_mmap(struct mm_struct *mm)
         /* update_hiwater_rss(mm) here? but nobody should be looking */
         /* Use -1 here to ensure all VMAs in the mm are unmapped */
         unmap_vmas(&tlb, vma, 0, -1);
-
-        if (unlikely(mm_is_oom_victim(mm))) {
-                /*
-                 * Wait for oom_reap_task() to stop working on this
-                 * mm. Because MMF_OOM_SKIP is already set before
-                 * calling down_read(), oom_reap_task() will not run
-                 * on this "mm" post up_write().
-                 *
-                 * mm_is_oom_victim() cannot be set from under us
-                 * either because victim->mm is already set to NULL
-                 * under task_lock before calling mmput and oom_mm is
-                 * set not NULL by the OOM killer only if victim->mm
-                 * is found not NULL while holding the task_lock.
-                 */
-                set_bit(MMF_OOM_SKIP, &mm->flags);
-                down_write(&mm->mmap_sem);
-                up_write(&mm->mmap_sem);
-        }
         free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
         tlb_finish_mmu(&tlb, 0, -1);
+        set_bit(MMF_OOM_SKIP, &mm->flags);
 
         /*
          * Walk the list again, actually closing and freeing it,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ff992fa..1fef1b6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -510,25 +510,16 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
         /*
          * If the mm has invalidate_{start,end}() notifiers that could block,
+         * or if the mm is in exit_mmap(), which has unpredictable dependencies,
          * sleep to give the oom victim some more time.
          * TODO: we really want to get rid of this ugly hack and make sure that
          * notifiers cannot block for unbounded amount of time
          */
-        if (mm_has_blockable_invalidate_notifiers(mm)) {
-                up_read(&mm->mmap_sem);
-                schedule_timeout_idle(HZ);
-                goto unlock_oom;
-        }
-
-        /*
-         * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
-         * work on the mm anymore. The check for MMF_OOM_SKIP must run
-         * under mmap_sem for reading because it serializes against the
-         * down_write();up_write() cycle in exit_mmap().
-         */
-        if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+        if (mm_has_blockable_invalidate_notifiers(mm) ||
+            test_bit(MMF_UNSTABLE, &mm->flags)) {
                 up_read(&mm->mmap_sem);
                 trace_skip_task_reaping(tsk->pid);
+                schedule_timeout_idle(HZ);
                 goto unlock_oom;
         }
 
@@ -590,11 +581,9 @@ static void oom_reap_task(struct task_struct *tsk)
         while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
                 schedule_timeout_idle(HZ/10);
 
-        if (attempts <= MAX_OOM_REAP_RETRIES ||
-            test_bit(MMF_OOM_SKIP, &mm->flags))
+        if (test_bit(MMF_UNSTABLE, &mm->flags))
                 goto done;
-
         pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
                 task_pid_nr(tsk), tsk->comm);
         debug_show_all_locks();
 
@@ -603,8 +592,12 @@
         tsk->oom_reaper_list = NULL;
 
         /*
-         * Hide this mm from OOM killer because it has been either reaped or
-         * somebody can't call up_write(mmap_sem).
+         * Hide this mm from the OOM killer because:
+         *   the OOM reaper completed reaping,
+         *     or
+         *   exit_mmap() told the OOM reaper not to start reaping,
+         *     or
+         *   neither exit_mmap() nor the OOM reaper started reaping.
          */
         set_bit(MMF_OOM_SKIP, &mm->flags);
 
-- 
1.8.3.1
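
For anyone who wants to poke at the locking pattern outside the kernel tree: below is a
minimal userspace sketch of the handshake the patch relies on in exit_mmap(), namely
"set a stop flag, then cycle an empty write-lock section so every reader that missed the
flag has drained". It models mmap_sem with a pthread rwlock and MMF_UNSTABLE with an
atomic flag; all names are illustrative and none of this is kernel API. Build with
"cc -pthread sketch.c".

/* sketch.c -- userspace model of the MMF_UNSTABLE / mmap_sem handshake */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER; /* stands in for mmap_sem */
static atomic_bool stop_reaping;                               /* stands in for MMF_UNSTABLE */

/* Reaper side: mirrors __oom_reap_task_mm() taking mmap_sem for read. */
static void *reaper(void *arg)
{
        (void)arg;
        pthread_rwlock_rdlock(&map_lock);
        if (atomic_load(&stop_reaping)) {
                /* The exit path already claimed the address space: back off. */
                pthread_rwlock_unlock(&map_lock);
                return NULL;
        }
        /* ... unmapping work would happen here, still under the read lock ... */
        pthread_rwlock_unlock(&map_lock);
        return NULL;
}

/* Exit side: mirrors the new block at the top of exit_mmap(). */
static void exit_path(void)
{
        atomic_store(&stop_reaping, true);      /* set_bit(MMF_UNSTABLE, &mm->flags) */
        /*
         * Empty write-lock critical section: by the time the write lock is
         * acquired, any reaper that entered its read-side section before the
         * flag was set has left it, and any reaper that enters afterwards is
         * guaranteed to observe the flag and bail out.
         */
        pthread_rwlock_wrlock(&map_lock);       /* down_write(&mm->mmap_sem) */
        pthread_rwlock_unlock(&map_lock);       /* up_write(&mm->mmap_sem) */
        /* From here on the address space can be torn down exclusively. */
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, reaper, NULL);
        exit_path();
        pthread_join(t, NULL);
        puts("done");
        return 0;
}

The same reasoning is why the reaper's MMF_UNSTABLE test has to happen after down_read()
succeeds: checking the flag before taking the lock would reopen the window the empty
down_write();up_write() cycle is there to close.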