Re: [PATCH] mm, oom: Disable preemption during OOM-kill operation.

Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> · Sun, 27 Sep 2015 14:51:01 +0900

(Added Oleg, for he might want to combine memory unmapper kernel thread
and this OOM killer thread shown in this post.)

Michal Hocko wrote:
> On Wed 23-09-15 23:26:35, Tetsuo Handa wrote:
> [...]
> > Sprinkling preempt_{enable,disable} all around the oom path can temporarily
> > slow down threads with higher priority. But doing so can guarantee that
> > the oom path is not delayed indefinitely. Imagine a scenario where a task
> > with idle priority called the oom path and other tasks with normal or
> > realtime priority preempt. How long will we hold oom_lock and keep the
> > system under oom?
> 
> What I've tried to say is that the OOM killer context might get priority
> boost to make sure it makes sufficient progress. This would be much more
> systematic approach IMO than sprinkling preempt_{enable,disable} all over
> the place.

Unlike boosting priority of fatal_signal_pending() OOM victim threads,
we need to undo it after returning from out_of_memory(). And the priority
of current thread which is calling out_of_memory() can be manipulated by
other threads. In order to avoid loosing new priority by restoring old
priority after returning from out_of_memory(), a dedicated kernel thread
will be needed. I think we will use a kernel thread named OOM kiiler.
So, did you mean something like below?

------------------------------------------------------------

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 03e6257..29d6190a 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -31,6 +31,13 @@ struct oom_control {
 	 * for display purposes.
 	 */
 	const int order;
+
+	/* Used for comunicating with OOM-killer kernel thread */
+	struct list_head list;
+	struct task_struct *task;
+	unsigned long totalpages;
+	int cpu;
+	bool done;
 };
 
 /*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 03b612b..3b8edd0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,8 @@
 #include <linux/freezer.h>
 #include <linux/ftrace.h>
 #include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/utsname.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
@@ -386,14 +388,23 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 static void dump_header(struct oom_control *oc, struct task_struct *p,
 			struct mem_cgroup *memcg)
 {
-	task_lock(current);
+	struct task_struct *task = oc->task;
+	task_lock(task);
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
 		"oom_score_adj=%hd\n",
-		current->comm, oc->gfp_mask, oc->order,
-		current->signal->oom_score_adj);
-	cpuset_print_task_mems_allowed(current);
-	task_unlock(current);
-	dump_stack();
+		task->comm, oc->gfp_mask, oc->order,
+		task->signal->oom_score_adj);
+	cpuset_print_task_mems_allowed(task);
+	task_unlock(task);
+	/* dump_lock logic is missing here. */
+	printk(KERN_DEFAULT "CPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
+	       oc->cpu, task->pid, task->comm,
+	       print_tainted(), init_utsname()->release,
+	       (int)strcspn(init_utsname()->version, " "),
+	       init_utsname()->version);
+	/* "Hardware name: " line is missing here. */
+	print_worker_info(KERN_DEFAULT, task);
+	show_stack(task, NULL);
 	if (memcg)
 		mem_cgroup_print_oom_info(memcg, p);
 	else
@@ -408,7 +419,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p,
 static atomic_t oom_victims = ATOMIC_INIT(0);
 static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
-bool oom_killer_disabled __read_mostly;
+bool oom_killer_disabled __read_mostly = true;
 
 /**
  * mark_oom_victim - mark the given task as OOM victim
@@ -647,6 +658,68 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+static DECLARE_WAIT_QUEUE_HEAD(oom_request_wait);
+static DECLARE_WAIT_QUEUE_HEAD(oom_response_wait);
+static LIST_HEAD(oom_request_list);
+static DEFINE_SPINLOCK(oom_request_list_lock);
+
+static int oom_killer(void *unused)
+{
+	struct task_struct *p;
+	unsigned int uninitialized_var(points);
+	struct oom_control *oc;
+
+	/* Boost priority in order to send SIGKILL as soon as possible. */
+	set_user_nice(current, MIN_NICE);
+
+ start:
+	wait_event(oom_request_wait, !list_empty(&oom_request_list));
+	oc = NULL;
+	spin_lock(&oom_request_list_lock);
+	if (!list_empty(&oom_request_list))
+		oc = list_first_entry(&oom_request_list, struct oom_control, list);
+	spin_unlock(&oom_request_list_lock);
+	if (!oc)
+		goto start;
+	p = oc->task;
+
+	/* Disable preemption in order to send SIGKILL as soon as possible. */
+	preempt_disable();
+
+	if (sysctl_oom_kill_allocating_task && p->mm &&
+	    !oom_unkillable_task(p, NULL, oc->nodemask) &&
+	    p->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+		get_task_struct(p);
+		oom_kill_process(oc, p, 0, oc->totalpages, NULL,
+				 "Out of memory (oom_kill_allocating_task)");
+		goto end;
+	}
+
+	p = select_bad_process(oc, &points, oc->totalpages);
+	/* Found nothing?!?! Either we hang forever, or we panic. */
+	if (!p && !is_sysrq_oom(oc)) {
+		dump_header(oc, NULL, NULL);
+		panic("Out of memory and no killable processes...\n");
+	}
+	if (p && p != (void *)-1UL)
+		oom_kill_process(oc, p, points, oc->totalpages, NULL,
+				 "Out of memory");
+ end:
+	preempt_enable();
+	oc->done = true;
+	wake_up_all(&oom_response_wait);
+	goto start;
+}
+
+static int __init run_oom_killer(void)
+{
+	struct task_struct *task = kthread_run(oom_killer, NULL, "OOM-killer");
+	BUG_ON(IS_ERR(task));
+	oom_killer_disabled = false;
+	return 0;
+}
+postcore_initcall(run_oom_killer);
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @oc: pointer to struct oom_control
@@ -658,10 +731,8 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
  */
 bool out_of_memory(struct oom_control *oc)
 {
-	struct task_struct *p;
 	unsigned long totalpages;
 	unsigned long freed = 0;
-	unsigned int uninitialized_var(points);
 	enum oom_constraint constraint = CONSTRAINT_NONE;
 
 	if (oom_killer_disabled)
@@ -672,6 +743,7 @@ bool out_of_memory(struct oom_control *oc)
 		/* Got some memory back in the last second. */
 		return true;
 
+	oc->task = current;
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
 	 * select it.  The goal is to allow it to allocate so that it may
@@ -695,30 +767,23 @@ bool out_of_memory(struct oom_control *oc)
 		oc->nodemask = NULL;
 	check_panic_on_oom(oc, constraint, NULL);
 
-	if (sysctl_oom_kill_allocating_task && current->mm &&
-	    !oom_unkillable_task(current, NULL, oc->nodemask) &&
-	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
-		get_task_struct(current);
-		oom_kill_process(oc, current, 0, totalpages, NULL,
-				 "Out of memory (oom_kill_allocating_task)");
-		return true;
-	}
-
-	p = select_bad_process(oc, &points, totalpages);
-	/* Found nothing?!?! Either we hang forever, or we panic. */
-	if (!p && !is_sysrq_oom(oc)) {
-		dump_header(oc, NULL, NULL);
-		panic("Out of memory and no killable processes...\n");
-	}
-	if (p && p != (void *)-1UL) {
-		oom_kill_process(oc, p, points, totalpages, NULL,
-				 "Out of memory");
-		/*
-		 * Give the killed process a good chance to exit before trying
-		 * to allocate memory again.
-		 */
-		schedule_timeout_killable(1);
-	}
+	/* OK. Let's wait for OOM killer. */
+	oc->cpu = raw_smp_processor_id();
+	oc->totalpages = totalpages;		
+	oc->done = false;
+	spin_lock(&oom_request_list_lock);
+	list_add(&oc->list, &oom_request_list);
+	spin_unlock(&oom_request_list_lock);
+	wake_up(&oom_request_wait);
+	wait_event(oom_response_wait, oc->done);
+	spin_lock(&oom_request_list_lock);
+	list_del(&oc->list);
+	spin_unlock(&oom_request_list_lock);
+	/*
+	 * Give the killed process a good chance to exit before trying
+	 * to allocate memory again.
+	 */
+	schedule_timeout_killable(1);
 	return true;
 }
 
------------------------------------------------------------

By the way, I think that we might want to omit dump_header() call
if the OOM victim's mm was already reported by previous OOM events
because output by show_mem() and dump_tasks() in dump_header() is
noisy. All OOM events between uptime 110 and 303 of
http://I-love.SAKURA.ne.jp/tmp/serial-20150927.txt.xz are choosing
the same mm. Even if the first OOM event completed within a few
seconds by disabling preemption, subsequent OOM events which
sequentially choose OOM victims without TIF_MEMDIE consumed many
seconds after all.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>