[RFC 2/2] OOM, PM: make OOM detection in the freezer path raceless

Michal Hocko <mhocko@xxxxxxx> · Tue, 18 Nov 2014 22:10:06 +0100

5695be142e20 (OOM, PM: OOM killed task shouldn't escape PM suspend)
has left a race window when OOM killer manages to note_oom_kill after
freeze_processes checks the counter. The race window is quite small and
really unlikely and partial solution deemed sufficient at the time of
submission.

Tejun wasn't happy about this partial solution though and insisted on a
full solution. That requires the full OOM and freezer's task freezing
exclusion, though. This is done by this patch which introduces oom_sem
RW lock and turns oom_killer_disable() into a full OOM barrier.

oom_killer_disabled is now checked at out_of_memory level which takes
the lock for reading. This also means that the page fault path is
covered now as well although it was assumed to be safe before. As per
Tejun, "We used to have freezing points deep in file system code which
may be reacheable from page fault." so it would be better and more
robust to not rely on freezing points here. Same applies to the memcg
OOM killer.

out_of_memory tells the caller whether the OOM was allowed to
trigger and the callers are supposed to handle the situation. The page
allocation path simply fails the allocation same as before. The page
fault path will be retrying the fault until the freezer fails and Sysrq
OOM trigger will simply complain to the log.

oom_killer_disable takes oom_sem for writing and after it disables
further OOM killer invocations it checks for any OOM victims which
are still alive (because they haven't woken up to handle the pending
signal). Victims are counted via {un}mark_tsk_oom_victim. The
last victim signals the completion via oom_victims_wait on which
oom_killer_disable() waits if it sees non zero oom_victims.
This is safe against both mark_tsk_oom_victim which cannot be called
after oom_killer_disabled is set and unmark_tsk_oom_victim signals the
completion only for the last oom_victim when oom is disabled and
oom_killer_disable waits for completion only of there was at least one
victim at the time it disabled the oom.

As oom_killer_disable is a full OOM barrier now we can postpone it to
later after all freezable tasks are frozen during PM freezer. This
reduces the time when OOM is put out order and so reduces chances of
misbehavior due to unexpected allocation failures.

TODO:
Android lowmemory killer abuses mark_tsk_oom_victim in lowmem_scan
and it has to learn about oom_disable logic as well.

Suggested-by: Tejun Heo <tj@xxxxxxxxxx>
Signed-off-by: Michal Hocko <mhocko@xxxxxxx>
---
 drivers/tty/sysrq.c    |  6 ++--
 include/linux/oom.h    | 26 ++++++++------
 kernel/power/process.c | 60 +++++++++-----------------------
 mm/memcontrol.c        |  4 ++-
 mm/oom_kill.c          | 94 +++++++++++++++++++++++++++++++++++++++++---------
 mm/page_alloc.c        | 32 ++++++++---------
 6 files changed, 132 insertions(+), 90 deletions(-)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 42bad18c66c9..6818589c1004 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -355,8 +355,10 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
-	out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL,
-		      0, NULL, true);
+	if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
+			   GFP_KERNEL, 0, NULL, true)) {
+		printk(KERN_INFO "OOM request ignored because killer is disabled\n");
+	}
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 8f7e74f8ab3a..d802575c9307 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -72,22 +72,26 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		unsigned long totalpages, const nodemask_t *nodemask,
 		bool force_kill);
 
-extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *mask, bool force_kill);
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
-extern bool oom_killer_disabled;
-
-static inline void oom_killer_disable(void)
-{
-	oom_killer_disabled = true;
-}
+/**
+ * oom_killer_disable - disable OOM killer
+ *
+ * Forces all page allocations to fail rather than trigger OOM killer.
+ * Will block and wait until all OOM victims are dead.
+ *
+ * Returns true if successfull and false if the OOM killer cannot be
+ * disabled.
+ */
+extern bool oom_killer_disable(void);
 
-static inline void oom_killer_enable(void)
-{
-	oom_killer_disabled = false;
-}
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+extern void oom_killer_enable(void);
 
 static inline bool oom_gfp_allowed(gfp_t gfp_mask)
 {
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5a6ec8678b9a..a4306e39f35c 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -108,30 +108,6 @@ static int try_to_freeze_tasks(bool user_only)
 	return todo ? -EBUSY : 0;
 }
 
-static bool __check_frozen_processes(void)
-{
-	struct task_struct *g, *p;
-
-	for_each_process_thread(g, p)
-		if (p != current && !freezer_should_skip(p) && !frozen(p))
-			return false;
-
-	return true;
-}
-
-/*
- * Returns true if all freezable tasks (except for current) are frozen already
- */
-static bool check_frozen_processes(void)
-{
-	bool ret;
-
-	read_lock(&tasklist_lock);
-	ret = __check_frozen_processes();
-	read_unlock(&tasklist_lock);
-	return ret;
-}
-
 /**
  * freeze_processes - Signal user space processes to enter the refrigerator.
  * The current thread will not be frozen.  The same process that calls
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void)
 int freeze_processes(void)
 {
 	int error;
-	int oom_kills_saved;
 
 	error = __usermodehelper_disable(UMH_FREEZING);
 	if (error)
@@ -157,27 +132,11 @@ int freeze_processes(void)
 	pm_wakeup_clear();
 	printk("Freezing user space processes ... ");
 	pm_freezing = true;
-	oom_kills_saved = oom_kills_count();
 	error = try_to_freeze_tasks(true);
 	if (!error) {
 		__usermodehelper_set_disable_depth(UMH_DISABLED);
-		oom_killer_disable();
-
-		/*
-		 * There might have been an OOM kill while we were
-		 * freezing tasks and the killed task might be still
-		 * on the way out so we have to double check for race.
-		 */
-		if (oom_kills_count() != oom_kills_saved &&
-		    !check_frozen_processes()) {
-			__usermodehelper_set_disable_depth(UMH_ENABLED);
-			printk("OOM in progress.");
-			error = -EBUSY;
-		} else {
-			printk("done.");
-		}
+		printk("done.\n");
 	}
-	printk("\n");
 	BUG_ON(in_atomic());
 
 	if (error)
@@ -206,6 +165,18 @@ int freeze_kernel_threads(void)
 	printk("\n");
 	BUG_ON(in_atomic());
 
+	/*
+	 * Now that everything freezable is handled we need to disbale
+	 * the OOM killer to disallow any further interference with
+	 * killable tasks.
+	 */
+	printk("Disabling OOM killer ... ");
+	if (!oom_killer_disable()) {
+		printk("failed.\n");
+		error = -EAGAIN;
+	} else
+		printk("done.\n");
+
 	if (error)
 		thaw_kernel_threads();
 	return error;
@@ -222,8 +193,6 @@ void thaw_processes(void)
 	pm_freezing = false;
 	pm_nosig_freezing = false;
 
-	oom_killer_enable();
-
 	printk("Restarting tasks ... ");
 
 	__usermodehelper_set_disable_depth(UMH_FREEZING);
@@ -251,6 +220,9 @@ void thaw_kernel_threads(void)
 {
 	struct task_struct *g, *p;
 
+	printk("Enabling OOM killer again.\n");
+	oom_killer_enable();
+
 	pm_nosig_freezing = false;
 	printk("Restarting kernel threads ... ");
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 302e0fc6d121..34bcbb053132 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2128,6 +2128,8 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 	current->memcg_oom.order = order;
 }
 
+extern bool oom_killer_disabled;
+
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
  * @handle: actually kill/wait or just clean up the OOM state
@@ -2155,7 +2157,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	if (!memcg)
 		return false;
 
-	if (!handle)
+	if (!handle || oom_killer_disabled)
 		goto cleanup;
 
 	owait.memcg = memcg;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8b6e14136f4f..b3ccd92bc6dc 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -405,30 +405,63 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 }
 
 /*
- * Number of OOM killer invocations (including memcg OOM killer).
- * Primarily used by PM freezer to check for potential races with
- * OOM killed frozen task.
+ * Number of OOM victims in flight
  */
-static atomic_t oom_kills = ATOMIC_INIT(0);
+static atomic_t oom_victims = ATOMIC_INIT(0);
+static DECLARE_COMPLETION(oom_victims_wait);
 
-int oom_kills_count(void)
+bool oom_killer_disabled __read_mostly;
+static DECLARE_RWSEM(oom_sem);
+
+void mark_tsk_oom_victim(struct task_struct *tsk)
 {
-	return atomic_read(&oom_kills);
+	BUG_ON(oom_killer_disabled);
+	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
+		return;
+	atomic_inc(&oom_victims);
 }
 
-void note_oom_kill(void)
+void unmark_tsk_oom_victim(struct task_struct *tsk)
 {
-	atomic_inc(&oom_kills);
+	int count;
+
+	if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
+		return;
+
+	down_read(&oom_sem);
+	/*
+	 * There is no need to signal the lasst oom_victim if there
+	 * is nobody who cares.
+	 */
+	if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+		complete(&oom_victims_wait);
+	up_read(&oom_sem);
 }
 
-void mark_tsk_oom_victim(struct task_struct *tsk)
+bool oom_killer_disable(void)
 {
-	set_tsk_thread_flag(tsk, TIF_MEMDIE);
+	/*
+	 * Make sure to not race with an ongoing OOM killer
+	 * and that the current is not the victim.
+	 */
+	down_write(&oom_sem);
+	if (!test_tsk_thread_flag(current, TIF_MEMDIE))
+		oom_killer_disabled = true;
+
+	count = atomic_read(&oom_victims);
+	up_write(&oom_sem);
+
+	if (count && oom_killer_disabled)
+		wait_for_completion(&oom_victims_wait);
+
+	return oom_killer_disabled;
 }
 
-void unmark_tsk_oom_victim(struct task_struct *tsk)
+void oom_killer_enable(void)
 {
-	clear_thread_flag(TIF_MEMDIE);
+	down_write(&oom_sem);
+	oom_killer_disabled = false;
+	up_write(&oom_sem);
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -626,7 +659,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
 }
 
 /**
- * out_of_memory - kill the "best" process when we run out of memory
+ * __out_of_memory - kill the "best" process when we run out of memory
  * @zonelist: zonelist pointer
  * @gfp_mask: memory allocation flags
  * @order: amount of memory being requested as a power of 2
@@ -638,7 +671,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *nodemask, bool force_kill)
 {
 	const nodemask_t *mpol_mask;
@@ -703,6 +736,31 @@ out:
 		schedule_timeout_killable(1);
 }
 
+/** out_of_memory -  tries to invoke OOM killer.
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
+ *
+ * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable()
+ * when it returns false. Otherwise returns true.
+ */
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *nodemask, bool force_kill)
+{
+	bool ret = false;
+
+	down_read(&oom_sem);
+	if (!oom_killer_disabled) {
+		__out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
+		ret = true;
+	}
+	up_read(&oom_sem);
+
+	return ret;
+}
+
 /*
  * The pagefault handler calls here because it is out of memory, so kill a
  * memory-hogging task.  If any populated zone has ZONE_OOM_LOCKED set, a
@@ -712,12 +770,16 @@ void pagefault_out_of_memory(void)
 {
 	struct zonelist *zonelist;
 
+	down_read(&oom_sem);
 	if (mem_cgroup_oom_synchronize(true))
-		return;
+		goto unlock;
 
 	zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
 	if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
-		out_of_memory(NULL, 0, 0, NULL, false);
+		if (!oom_killer_disabled)
+			__out_of_memory(NULL, 0, 0, NULL, false);
 		oom_zonelist_unlock(zonelist, GFP_KERNEL);
 	}
+unlock:
+	up_read(&oom_sem);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9cd36b822444..d44d69aa7b70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -242,8 +242,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
 					PB_migrate, PB_migrate_end);
 }
 
-bool oom_killer_disabled __read_mostly;
-
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -2241,10 +2239,11 @@ static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, struct zone *preferred_zone,
-	int classzone_idx, int migratetype)
+	int classzone_idx, int migratetype, bool *oom_failed)
 {
 	struct page *page;
 
+	*oom_failed = false;
 	/* Acquire the per-zone oom lock for each zone */
 	if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
 		schedule_timeout_uninterruptible(1);
@@ -2252,14 +2251,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	}
 
 	/*
-	 * PM-freezer should be notified that there might be an OOM killer on
-	 * its way to kill and wake somebody up. This is too early and we might
-	 * end up not killing anything but false positives are acceptable.
-	 * See freeze_processes.
-	 */
-	note_oom_kill();
-
-	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
 	 * here, this is only to catch a parallel oom killing, we must fail if
 	 * we're still under heavy pressure.
@@ -2289,8 +2280,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(zonelist, gfp_mask, order, nodemask, false);
-
+	if (!out_of_memory(zonelist, gfp_mask, order, nodemask, false))
+		*oom_failed = true;
 out:
 	oom_zonelist_unlock(zonelist, gfp_mask);
 	return page;
@@ -2716,8 +2707,8 @@ rebalance:
 	 */
 	if (!did_some_progress) {
 		if (oom_gfp_allowed(gfp_mask)) {
-			if (oom_killer_disabled)
-				goto nopage;
+			bool oom_failed;
+
 			/* Coredumps can quickly deplete all memory reserves */
 			if ((current->flags & PF_DUMPCORE) &&
 			    !(gfp_mask & __GFP_NOFAIL))
@@ -2725,10 +2716,19 @@ rebalance:
 			page = __alloc_pages_may_oom(gfp_mask, order,
 					zonelist, high_zoneidx,
 					nodemask, preferred_zone,
-					classzone_idx, migratetype);
+					classzone_idx, migratetype,
+					&oom_failed);
+
 			if (page)
 				goto got_pg;
 
+			/*
+			 * OOM killer might be disabled and then we have to
+			 * fail the allocation
+			 */
+			if (oom_failed)
+				goto nopage;
+
 			if (!(gfp_mask & __GFP_NOFAIL)) {
 				/*
 				 * The oom killer is not called for high-order
-- 
2.1.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>