Re: cgroup-aware OOM killer, how to move forward

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Roman, will you check this cleanup patch? This patch applies on top of next-20180724.
I assumed that your series does not kill processes whose termination the current
thread should not wait for.

From 86ba99fbf73a9eda0df5ee4ae70c075781e83f81 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
Date: Tue, 24 Jul 2018 14:00:45 +0900
Subject: [PATCH] mm,oom: Check pending victims earlier in out_of_memory().

The "mm, oom: cgroup-aware OOM killer" patchset introduced INFLIGHT_VICTIM
in order to replace open-coded ((void *)-1UL). But (regarding CONFIG_MMU=y
case) we have a list of inflight OOM victim threads which are connected to
oom_reaper_list. Thus we can check whether there are inflight OOM victims
before starting process/memcg list traversal. Since it is likely that only a
few threads are linked to oom_reaper_list, checking all victims' OOM domains
will not matter.

Thus, check whether there are inflight OOM victims before starting
process/memcg list traversal and eliminate the "abort" path.

Note that this patch could temporarily regress CONFIG_MMU=n kernels
because this patch selects the same victims rather than waiting for victims
if CONFIG_MMU=n. But if it makes a difference, we had better revert commit
212925802454672e ("mm: oom: let oom_reap_task and exit_mmap run
concurrently") because that commit removed setting of MMF_OOM_SKIP from
__mmput() based on an assumption that OOM lockup on CONFIG_MMU=n kernels
won't happen (in other words, OOM victims shall terminate immediately).

This patch also mitigates OOM lockup caused by schedule_timeout_killable(1)
with oom_lock held.

Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
Cc: Roman Gushchin <guro@xxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: David Rientjes <rientjes@xxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
---
 include/linux/memcontrol.h |   9 ++-
 include/linux/oom.h        |   9 +--
 include/linux/sched.h      |   2 +-
 mm/memcontrol.c            |  61 ++++----------------
 mm/oom_kill.c              | 138 ++++++++++++++++++++++-----------------------
 5 files changed, 84 insertions(+), 135 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 088ae04..3c202c8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -420,8 +420,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup *,
 				   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
-int mem_cgroup_scan_tasks(struct mem_cgroup *,
-			  int (*)(struct task_struct *, void *), void *);
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+			   void (*fn)(struct task_struct *, void *), void *arg);
 
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
@@ -926,10 +926,9 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
 {
 }
 
-static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
-		int (*fn)(struct task_struct *, void *), void *arg)
+static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+		void (*fn)(struct task_struct *, void *), void *arg)
 {
-	return 0;
 }
 
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
diff --git a/include/linux/oom.h b/include/linux/oom.h
index a16a155..6eb72b9 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -10,13 +10,6 @@
 #include <linux/sched/coredump.h> /* MMF_* */
 #include <linux/mm.h> /* VM_FAULT* */
 
-
-/*
- * Special value returned by victim selection functions to indicate
- * that are inflight OOM victims.
- */
-#define INFLIGHT_VICTIM ((void *)-1UL)
-
 struct zonelist;
 struct notifier_block;
 struct mem_cgroup;
@@ -131,7 +124,7 @@ extern unsigned long oom_badness(struct task_struct *p,
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
-extern int oom_evaluate_task(struct task_struct *task, void *arg);
+extern void oom_evaluate_task(struct task_struct *task, void *arg);
 
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4c66d03..a2d4719 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1175,7 +1175,7 @@ struct task_struct {
 #endif
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
-	struct task_struct		*oom_reaper_list;
+	struct list_head		oom_victim_list;
 #endif
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4603ad7..2b33456 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1058,33 +1058,29 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
  * @arg: argument passed to @fn
  *
  * This function iterates over tasks attached to @memcg or to any of its
- * descendants and calls @fn for each task. If @fn returns a non-zero
- * value, the function breaks the iteration loop and returns the value.
- * Otherwise, it will iterate over all tasks and return 0.
+ * descendants and calls @fn for each task.
  *
  * If memcg is the root memory cgroup, this function will iterate only
  * over tasks belonging directly to the root memory cgroup.
  */
-int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
-			  int (*fn)(struct task_struct *, void *), void *arg)
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+			   void (*fn)(struct task_struct *, void *), void *arg)
 {
 	struct mem_cgroup *iter;
-	int ret = 0;
 
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct css_task_iter it;
 		struct task_struct *task;
 
 		css_task_iter_start(&iter->css, 0, &it);
-		while (!ret && (task = css_task_iter_next(&it)))
-			ret = fn(task, arg);
+		while ((task = css_task_iter_next(&it)))
+			fn(task, arg);
 		css_task_iter_end(&it);
-		if (ret || memcg == root_mem_cgroup) {
+		if (memcg == root_mem_cgroup) {
 			mem_cgroup_iter_break(memcg, iter);
 			break;
 		}
 	}
-	return ret;
 }
 
 /**
@@ -2854,7 +2850,6 @@ static long memcg_oom_badness(struct mem_cgroup *memcg,
 /*
  * Checks if the given memcg is a valid OOM victim and returns a number,
  * which means the folowing:
- *   -1: there are inflight OOM victim tasks, belonging to the memcg
  *    0: memcg is not eligible, e.g. all belonging tasks are protected
  *       by oom_score_adj set to OOM_SCORE_ADJ_MIN
  *   >0: memcg is eligible, and the returned value is an estimation
@@ -2874,9 +2869,6 @@ static long oom_evaluate_memcg(struct mem_cgroup *memcg,
 	 * leaf memory cgroups, so we approximate it's oom_score
 	 * by summing oom_score of all belonging tasks, which are
 	 * owners of their mm structs.
-	 *
-	 * If there are inflight OOM victim tasks inside
-	 * the root memcg, we return -1.
 	 */
 	if (memcg == root_mem_cgroup) {
 		struct css_task_iter it;
@@ -2884,24 +2876,9 @@ static long oom_evaluate_memcg(struct mem_cgroup *memcg,
 		long score = 0;
 
 		css_task_iter_start(&memcg->css, 0, &it);
-		while ((task = css_task_iter_next(&it))) {
-			if (tsk_is_oom_victim(task) &&
-			    !test_bit(MMF_OOM_SKIP,
-				      &task->signal->oom_mm->flags)) {
-				score = -1;
-				break;
-			}
-
-			task_lock(task);
-			if (!task->mm) {
-				task_unlock(task);
-				continue;
-			}
-			task_unlock(task);
-
+		while ((task = css_task_iter_next(&it)))
 			score += oom_badness(task, memcg, nodemask,
 					     totalpages);
-		}
 		css_task_iter_end(&it);
 
 		return score;
@@ -2912,25 +2889,17 @@ static long oom_evaluate_memcg(struct mem_cgroup *memcg,
 	 *
 	 * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN
 	 * as unkillable.
-	 *
-	 * If there are inflight OOM victim tasks inside the memcg,
-	 * we return -1.
 	 */
 	css_task_iter_start(&memcg->css, 0, &it);
 	while ((task = css_task_iter_next(&it))) {
-		if (!eligible &&
-		    task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN)
+		if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 			eligible = 1;
-
-		if (tsk_is_oom_victim(task) &&
-		    !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
-			eligible = -1;
 			break;
 		}
 	}
 	css_task_iter_end(&it);
 
-	if (eligible <= 0)
+	if (eligible == 0)
 		return eligible;
 
 	return memcg_oom_badness(memcg, nodemask, totalpages);
@@ -2994,16 +2963,6 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
 		if (score == 0)
 			continue;
 
-		/*
-		 * If there are inflight OOM victims, we don't need
-		 * to look further for new victims.
-		 */
-		if (score == -1) {
-			oc->chosen_memcg = INFLIGHT_VICTIM;
-			mem_cgroup_iter_break(root, iter);
-			break;
-		}
-
 		group_score += score;
 
 		if (group_score > oc->chosen_points) {
@@ -3012,7 +2971,7 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
 		}
 	}
 
-	if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM)
+	if (oc->chosen_memcg)
 		css_get(&oc->chosen_memcg->css);
 
 	rcu_read_unlock();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 104ef4a..b3e6157 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -312,25 +312,13 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
 	return CONSTRAINT_NONE;
 }
 
-int oom_evaluate_task(struct task_struct *task, void *arg)
+void oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
 	unsigned long points;
 
 	if (oom_unkillable_task(task, NULL, oc->nodemask))
-		goto next;
-
-	/*
-	 * This task already has access to memory reserves and is being killed.
-	 * Don't allow any other task to have access to the reserves unless
-	 * the task has MMF_OOM_SKIP because chances that it would release
-	 * any memory is quite low.
-	 */
-	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
-			goto next;
-		goto abort;
-	}
+		return;
 
 	/*
 	 * If task is allocating a lot of memory and has been marked to be
@@ -343,29 +331,22 @@ int oom_evaluate_task(struct task_struct *task, void *arg)
 
 	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
 	if (!points || points < oc->chosen_points)
-		goto next;
+		return;
 
 	/* Prefer thread group leaders for display purposes */
 	if (points == oc->chosen_points && thread_group_leader(oc->chosen_task))
-		goto next;
+		return;
 select:
 	if (oc->chosen_task)
 		put_task_struct(oc->chosen_task);
 	get_task_struct(task);
 	oc->chosen_task = task;
 	oc->chosen_points = points;
-next:
-	return 0;
-abort:
-	if (oc->chosen_task)
-		put_task_struct(oc->chosen_task);
-	oc->chosen_task = INFLIGHT_VICTIM;
-	return 1;
 }
 
 /*
  * Simple selection loop. We choose the process with the highest number of
- * 'points'. In case scan was aborted, oc->chosen_task is set to -1.
+ * 'points'.
  */
 static void select_bad_process(struct oom_control *oc)
 {
@@ -376,8 +357,7 @@ static void select_bad_process(struct oom_control *oc)
 
 		rcu_read_lock();
 		for_each_process(p)
-			if (oom_evaluate_task(p, oc))
-				break;
+			oom_evaluate_task(p, oc);
 		rcu_read_unlock();
 	}
 
@@ -492,7 +472,7 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
  */
 static struct task_struct *oom_reaper_th;
 static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-static struct task_struct *oom_reaper_list;
+static LIST_HEAD(oom_victim_list);
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
 bool __oom_reap_task_mm(struct mm_struct *mm)
@@ -598,14 +578,16 @@ static void oom_reap_task(struct task_struct *tsk)
 	debug_show_all_locks();
 
 done:
-	tsk->oom_reaper_list = NULL;
-
 	/*
 	 * Hide this mm from OOM killer because it has been either reaped or
 	 * somebody can't call up_write(mmap_sem).
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);
 
+	spin_lock(&oom_reaper_lock);
+	list_del(&tsk->oom_victim_list);
+	spin_unlock(&oom_reaper_lock);
+
 	/* Drop a reference taken by wake_oom_reaper */
 	put_task_struct(tsk);
 }
@@ -615,12 +597,13 @@ static int oom_reaper(void *unused)
 	while (true) {
 		struct task_struct *tsk = NULL;
 
-		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
+		wait_event_freezable(oom_reaper_wait,
+				     !list_empty(&oom_victim_list));
 		spin_lock(&oom_reaper_lock);
-		if (oom_reaper_list != NULL) {
-			tsk = oom_reaper_list;
-			oom_reaper_list = tsk->oom_reaper_list;
-		}
+		if (!list_empty(&oom_victim_list))
+			tsk = list_first_entry(&oom_victim_list,
+					       struct task_struct,
+					       oom_victim_list);
 		spin_unlock(&oom_reaper_lock);
 
 		if (tsk)
@@ -633,14 +616,13 @@ static int oom_reaper(void *unused)
 static void wake_oom_reaper(struct task_struct *tsk)
 {
 	/* tsk is already queued? */
-	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+	if (tsk->oom_victim_list.next)
 		return;
 
 	get_task_struct(tsk);
 
 	spin_lock(&oom_reaper_lock);
-	tsk->oom_reaper_list = oom_reaper_list;
-	oom_reaper_list = tsk;
+	list_add_tail(&tsk->oom_victim_list, &oom_victim_list);
 	spin_unlock(&oom_reaper_lock);
 	trace_wake_reaper(tsk->pid);
 	wake_up(&oom_reaper_wait);
@@ -981,17 +963,18 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	__oom_kill_process(victim);
 }
 
-static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+static void oom_kill_memcg_member(struct task_struct *task, void *unused)
 {
 	get_task_struct(task);
 	__oom_kill_process(task);
-	return 0;
 }
 
 static bool oom_kill_memcg_victim(struct oom_control *oc)
 {
-	if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM)
-		return oc->chosen_memcg;
+	bool ret = true;
+
+	if (oc->chosen_memcg == NULL)
+		return false;
 
 	/*
 	 * If memory.oom_group is set, kill all tasks belonging to the sub-tree
@@ -1001,23 +984,18 @@ static bool oom_kill_memcg_victim(struct oom_control *oc)
 	if (mem_cgroup_oom_group(oc->chosen_memcg)) {
 		mem_cgroup_scan_tasks(oc->chosen_memcg, oom_kill_memcg_member,
 				      NULL);
-		/* We have one or more terminating processes at this point. */
-		oc->chosen_task = INFLIGHT_VICTIM;
 	} else {
 		oc->chosen_points = 0;
 		oc->chosen_task = NULL;
 		mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc);
 
-		if (oc->chosen_task == NULL ||
-		    oc->chosen_task == INFLIGHT_VICTIM)
-			goto out;
-
-		__oom_kill_process(oc->chosen_task);
+		if (!oc->chosen_task)
+			ret = false;
+		else
+			__oom_kill_process(oc->chosen_task);
 	}
-
-out:
 	mem_cgroup_put(oc->chosen_memcg);
-	return oc->chosen_task;
+	return ret;
 }
 
 /*
@@ -1058,6 +1036,34 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+static bool oom_has_pending_victims(struct oom_control *oc)
+{
+#ifdef CONFIG_MMU
+	struct task_struct *p;
+
+	if (is_sysrq_oom(oc))
+		return false;
+	/*
+	 * Since oom_reap_task_mm()/exit_mmap() will set MMF_OOM_SKIP, let's
+	 * wait for pending victims until MMF_OOM_SKIP is set.
+	 */
+	spin_lock(&oom_reaper_lock);
+	list_for_each_entry(p, &oom_victim_list, oom_victim_list)
+		if (!oom_unkillable_task(p, oc->memcg, oc->nodemask) &&
+		    !test_bit(MMF_OOM_SKIP, &p->signal->oom_mm->flags))
+			break;
+	spin_unlock(&oom_reaper_lock);
+	return &p->oom_victim_list != &oom_victim_list;
+#else
+	/*
+	 * Since nobody except __oom_kill_process() sets MMF_OOM_SKIP, waiting
+	 * for pending victims until MMF_OOM_SKIP is set is useless. Therefore,
+	 * simply let the OOM killer select pending victims again.
+	 */
+	return false;
+#endif
+}
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @oc: pointer to struct oom_control
@@ -1070,7 +1076,6 @@ int unregister_oom_notifier(struct notifier_block *nb)
 bool out_of_memory(struct oom_control *oc)
 {
 	unsigned long freed = 0;
-	bool delay = false; /* if set, delay next allocation attempt */
 
 	oc->constraint = CONSTRAINT_NONE;
 	if (oom_killer_disabled)
@@ -1112,6 +1117,9 @@ bool out_of_memory(struct oom_control *oc)
 		oc->nodemask = NULL;
 	check_panic_on_oom(oc);
 
+	if (oom_has_pending_victims(oc))
+		return true;
+
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
 	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
@@ -1121,30 +1129,20 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
-	if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) {
-		delay = true;
-		goto out;
-	}
+	if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc))
+		return true;
 
 	select_bad_process(oc);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
-	if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
+	if (!oc->chosen_task) {
+		if (is_sysrq_oom(oc) || is_memcg_oom(oc))
+			return false;
 		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-	if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM)
-		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
-				 "Memory cgroup out of memory");
-
-out:
-	/*
-	 * Give the killed process a good chance to exit before trying
-	 * to allocate memory again.
-	 */
-	if (delay)
-		schedule_timeout_killable(1);
-
-	return !!oc->chosen_task;
+	oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
+			 "Memory cgroup out of memory");
+	return true;
 }
 
 /*
-- 
1.8.3.1





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux