Re: Multiple oom_reaper BUGs: unmap_page_range racing with exit_mmap

Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> · Wed, 6 Dec 2017 20:37:06 +0900

David Rientjes wrote:
> On Wed, 6 Dec 2017, Tetsuo Handa wrote:
> > Also, I don't know what exit_mmap() is doing but I think that there is a
> > possibility that the OOM reaper tries to reclaim mlocked pages as soon as
> > exit_mmap() cleared VM_LOCKED flag by calling munlock_vma_pages_all().
> > 
> > 	if (mm->locked_vm) {
> > 		vma = mm->mmap;
> > 		while (vma) {
> > 			if (vma->vm_flags & VM_LOCKED)
> > 				munlock_vma_pages_all(vma);
> > 			vma = vma->vm_next;
> > 		}
> > 	}
> > 
> 
> Yes, that looks possible as well, although the problem I have reported can 
> happen with or without mlock.  Did you find this by code inspection or 
> have you experienced runtime problems with it?

By code inspection.

> 
> I think this argues to do MMF_REAPING-style behavior at the beginning of 
> exit_mmap() and avoid reaping all together once we have reached that 
> point.  There are no more users of the mm and we are in the process of 
> tearing it down, I'm not sure that the oom reaper should be in the 
> business with trying to interfere with that.  Or are there actual bug 
> reports where an oom victim gets wedged while in exit_mmap() prior to 
> releasing its memory?

If our assumption is that the OOM reaper can reclaim majority of OOM
victim's memory via victim's ->signal->oom_mm, what will be wrong with
simply reverting 212925802454672e ("mm: oom: let oom_reap_task and
exit_mmap run concurrently") and replace mmgrab()/mmdrop_async() with
mmget()/mmput_async() so that the OOM reaper no longer need to worry
about tricky __mmput() behavior (like shown below) ?

----------
 kernel/fork.c |   17 ++++++++++++-----
 mm/mmap.c     |   17 -----------------
 mm/oom_kill.c |   21 +++++++--------------
 3 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 432eadf..018a857 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -394,12 +394,18 @@ static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
 	sched_autogroup_exit(sig);
-	/*
-	 * __mmdrop is not safe to call from softirq context on x86 due to
-	 * pgd_dtor so postpone it to the async context
-	 */
-	if (sig->oom_mm)
+	if (sig->oom_mm) {
+#ifdef CONFIG_MMU
+		mmput_async(sig->oom_mm);
+#else
+		/*
+		 * There might be archtectures where calling __mmdrop() from
+		 * softirq context is not safe. Thus, postpone it to the async
+		 * context.
+		 */
 		mmdrop_async(sig->oom_mm);
+#endif
+	}
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -931,6 +937,7 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	set_bit(MMF_OOM_SKIP, &mm->flags);
 	mmdrop(mm);
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index a4d5468..fafaf06 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3019,23 +3019,6 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
 
-	set_bit(MMF_OOM_SKIP, &mm->flags);
-	if (unlikely(tsk_is_oom_victim(current))) {
-		/*
-		 * Wait for oom_reap_task() to stop working on this
-		 * mm. Because MMF_OOM_SKIP is already set before
-		 * calling down_read(), oom_reap_task() will not run
-		 * on this "mm" post up_write().
-		 *
-		 * tsk_is_oom_victim() cannot be set from under us
-		 * either because current->mm is already set to NULL
-		 * under task_lock before calling mmput and oom_mm is
-		 * set not NULL by the OOM killer only if current->mm
-		 * is found not NULL while holding the task_lock.
-		 */
-		down_write(&mm->mmap_sem);
-		up_write(&mm->mmap_sem);
-	}
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb, 0, -1);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c957be3..eb2a005 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -528,18 +528,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		goto unlock_oom;
 	}
 
-	/*
-	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
-	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
-	 * under mmap_sem for reading because it serializes against the
-	 * down_write();up_write() cycle in exit_mmap().
-	 */
-	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
-		up_read(&mm->mmap_sem);
-		trace_skip_task_reaping(tsk->pid);
-		goto unlock_oom;
-	}
-
 	trace_start_task_reaping(tsk->pid);
 
 	/*
@@ -683,8 +671,13 @@ static void mark_oom_victim(struct task_struct *tsk)
 		return;
 
 	/* oom_mm is bound to the signal struct life time. */
-	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
-		mmgrab(tsk->signal->oom_mm);
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+#ifdef CONFIG_MMU
+		mmget(mm);
+#else
+		mmgrab(mm);
+#endif
+	}
 
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
----------

If holding the address space when there are so many victim's mm waiting for
the OOM reaper to reclaim is considered dangerous, I think we can try direct
OOM reaping (like untested patch shown below) in order to reclaim first-blocked
first-reaped basis (and also serve as a mitigation for race caused by removing
oom_lock from the OOM reaper).

----------
 include/linux/mm_types.h |   3 ++
 include/linux/sched.h    |   3 --
 mm/oom_kill.c            | 132 ++++++++++++++---------------------------------
 3 files changed, 43 insertions(+), 95 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index cfd0ac4..068119b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -501,6 +501,9 @@ struct mm_struct {
 	atomic_long_t hugetlb_usage;
 #endif
 	struct work_struct async_put_work;
+#ifdef CONFIG_MMU
+	unsigned long oom_reap_started;
+#endif
 
 #if IS_ENABLED(CONFIG_HMM)
 	/* HMM needs to track a few things per mm */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a2709d2..d63b599 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1081,9 +1081,6 @@ struct task_struct {
 	unsigned long			task_state_change;
 #endif
 	int				pagefault_disabled;
-#ifdef CONFIG_MMU
-	struct task_struct		*oom_reaper_list;
-#endif
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3b0d0fe..a9f8bae 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -309,9 +309,14 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
 	return CONSTRAINT_NONE;
 }
 
+#ifdef CONFIG_MMU
+static void oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm);
+#endif
+
 static int oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
+	struct mm_struct *mm;
 	unsigned long points;
 
 	if (oom_unkillable_task(task, NULL, oc->nodemask))
@@ -324,7 +329,8 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	 * any memory is quite low.
 	 */
 	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+		mm = task->signal->oom_mm;
+		if (test_bit(MMF_OOM_SKIP, &mm->flags))
 			goto next;
 		goto abort;
 	}
@@ -357,6 +363,15 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	if (oc->chosen)
 		put_task_struct(oc->chosen);
 	oc->chosen = (void *)-1UL;
+#ifdef CONFIG_MMU
+	get_task_struct(task);
+	if (!is_memcg_oom(oc))
+		rcu_read_unlock();
+	oom_reap_task_mm(task, mm);
+	put_task_struct(task);
+	if (!is_memcg_oom(oc))
+		rcu_read_lock();
+#endif
 	return 1;
 }
 
@@ -474,23 +489,14 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 	return false;
 }
 
-
 #ifdef CONFIG_MMU
-/*
- * OOM Reaper kernel thread which tries to reap the memory used by the OOM
- * victim (if that is possible) to help the OOM killer to move on.
- */
-static struct task_struct *oom_reaper_th;
-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-static struct task_struct *oom_reaper_list;
-static DEFINE_SPINLOCK(oom_reaper_lock);
-
 static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
 	bool ret = true;
 
+	trace_wake_reaper(tsk->pid);
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		ret = false;
 		trace_skip_task_reaping(tsk->pid);
@@ -507,8 +513,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
 	 */
 	if (mm_has_notifiers(mm)) {
+		ret = false;
 		up_read(&mm->mmap_sem);
-		schedule_timeout_idle(HZ);
 		goto unlock_oom;
 	}
 
@@ -567,82 +573,22 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	return ret;
 }
 
-#define MAX_OOM_REAP_RETRIES 10
-static void oom_reap_task(struct task_struct *tsk)
-{
-	int attempts = 0;
-	struct mm_struct *mm = tsk->signal->oom_mm;
-
-	/* Retry the down_read_trylock(mmap_sem) a few times */
-	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
-		schedule_timeout_idle(HZ/10);
-
-	if (attempts <= MAX_OOM_REAP_RETRIES)
-		goto done;
-
-
-	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
-		task_pid_nr(tsk), tsk->comm);
-	debug_show_all_locks();
-
-done:
-	tsk->oom_reaper_list = NULL;
-
-	/*
-	 * Hide this mm from OOM killer because it has been either reaped or
-	 * somebody can't call up_write(mmap_sem).
-	 */
-	set_bit(MMF_OOM_SKIP, &mm->flags);
-
-	/* Drop a reference taken by wake_oom_reaper */
-	put_task_struct(tsk);
-}
-
-static int oom_reaper(void *unused)
+static void oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
-	while (true) {
-		struct task_struct *tsk = NULL;
-
-		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-		spin_lock(&oom_reaper_lock);
-		if (oom_reaper_list != NULL) {
-			tsk = oom_reaper_list;
-			oom_reaper_list = tsk->oom_reaper_list;
+	if (__oom_reap_task_mm(tsk, mm))
+		/* Hide this mm from OOM killer because we reaped it. */
+		set_bit(MMF_OOM_SKIP, &mm->flags);
+	else if (!mm->oom_reap_started)
+		mm->oom_reap_started = jiffies;
+	else if (time_after(jiffies, mm->oom_reap_started + HZ)) {
+		if (!mm_has_notifiers(mm)) {
+			pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+				task_pid_nr(tsk), tsk->comm);
+			debug_show_all_locks();
 		}
-		spin_unlock(&oom_reaper_lock);
-
-		if (tsk)
-			oom_reap_task(tsk);
+		/* Hide this mm from OOM killer because we can't reap. */
+		set_bit(MMF_OOM_SKIP, &mm->flags);
 	}
-
-	return 0;
-}
-
-static void wake_oom_reaper(struct task_struct *tsk)
-{
-	/* tsk is already queued? */
-	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
-		return;
-
-	get_task_struct(tsk);
-
-	spin_lock(&oom_reaper_lock);
-	tsk->oom_reaper_list = oom_reaper_list;
-	oom_reaper_list = tsk;
-	spin_unlock(&oom_reaper_lock);
-	trace_wake_reaper(tsk->pid);
-	wake_up(&oom_reaper_wait);
-}
-
-static int __init oom_init(void)
-{
-	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
-	return 0;
-}
-subsys_initcall(oom_init)
-#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
-{
 }
 #endif /* CONFIG_MMU */
 
@@ -825,7 +771,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 					      DEFAULT_RATELIMIT_BURST);
-	bool can_oom_reap = true;
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -835,7 +780,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	task_lock(p);
 	if (task_will_free_mem(p)) {
 		mark_oom_victim(p);
-		wake_oom_reaper(p);
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -924,7 +868,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 		if (same_thread_group(p, victim))
 			continue;
 		if (is_global_init(p)) {
-			can_oom_reap = false;
 			set_bit(MMF_OOM_SKIP, &mm->flags);
 			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
 					task_pid_nr(victim), victim->comm,
@@ -941,9 +884,15 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	}
 	rcu_read_unlock();
 
-	if (can_oom_reap)
-		wake_oom_reaper(victim);
-
+#ifdef CONFIG_MMU
+	/*
+	 * sysctl_oom_kill_allocating_task case could get stuck because
+	 * select_bad_process() which calls oom_reap_task_mm() is not called.
+	 */
+	if (sysctl_oom_kill_allocating_task &&
+	    !test_bit(MMF_OOM_SKIP, &mm->flags))
+		oom_reap_task_mm(victim, mm);
+#endif
 	mmdrop(mm);
 	put_task_struct(victim);
 }
@@ -1019,7 +968,6 @@ bool out_of_memory(struct oom_control *oc)
 	 */
 	if (task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		wake_oom_reaper(current);
 		return true;
 	}
 
----------

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>