[PATCH] mm,oom: Do not sleep with oom_lock held.

Michal, before we think about whether to add preempt_disable()/preempt_enable_no_resched()
to oom_kill_process(), will you accept this patch?
This is one of the problems that annoy the kmallocwd patch on CONFIG_PREEMPT_NONE=y kernels.

---------- sleep-with-oom_lock.c start ----------
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <sys/prctl.h>

int main(int argc, char *argv[])
{
	struct sched_param sp = { 0 };
	cpu_set_t cpu = { { 1 } };
	static int pipe_fd[2] = { EOF, EOF };
	char *buf = NULL;
	unsigned long size = 0;
	unsigned int i;
	int fd;
	pipe(pipe_fd);
	signal(SIGCHLD, SIG_IGN); /* auto-reap children so no zombies remain */
	if (fork() == 0) {
		prctl(PR_SET_NAME, (unsigned long) "first-victim", 0, 0, 0);
		while (1)
			pause();
	}
	close(pipe_fd[1]);
	sched_setaffinity(0, sizeof(cpu), &cpu); /* pin everything below to CPU 0 */
	prctl(PR_SET_NAME, (unsigned long) "normal-priority", 0, 0, 0);
	for (i = 0; i < 64; i++)
		if (fork() == 0) {
			char c;
			/* Wait until the first-victim is OOM-killed. */
			read(pipe_fd[0], &c, 1);
			/* Try to consume as much CPU time as possible. */
			while(1);
			_exit(0);
		}
	close(pipe_fd[0]);
	fd = open("/dev/zero", O_RDONLY);
	/* Grab as large an (overcommitted) buffer as realloc() will give us. */
	for (size = 1048576; size < 512UL * (1 << 30); size <<= 1) {
		char *cp = realloc(buf, size);
		if (!cp) {
			size >>= 1;
			break;
		}
		buf = cp;
	}
	sched_setscheduler(0, SCHED_IDLE, &sp);
	prctl(PR_SET_NAME, (unsigned long) "idle-priority", 0, 0, 0);
	read(fd, buf, size); /* Will cause OOM due to overcommit */
	kill(-1, SIGKILL);
	return 0; /* Not reached. */
}
---------- sleep-with-oom_lock.c end ----------

---------- console log start ----------
[  915.132305] CPU: 0 PID: 1341 Comm: idle-priority Not tainted 4.5.0-rc6-next-20160301 #89
[  915.137977] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013
[  915.144914]  0000000000000286 0000000000291f95 ffff88003b06f860 ffffffff8131424d
[  915.150071]  0000000000000000 ffff88003b06fa98 ffff88003b06f900 ffffffff811b9934
[  915.155231]  0000000000000206 ffffffff8182b7b0 ffff88003b06f8a0 ffffffff810bae39
[  915.160540] Call Trace:
[  915.162963]  [<ffffffff8131424d>] dump_stack+0x85/0xc8
[  915.166689]  [<ffffffff811b9934>] dump_header+0x5b/0x394
[  915.170509]  [<ffffffff810bae39>] ? trace_hardirqs_on_caller+0xf9/0x1c0
[  915.175324]  [<ffffffff810baf0d>] ? trace_hardirqs_on+0xd/0x10
[  915.179497]  [<ffffffff81142286>] oom_kill_process+0x376/0x570
[  915.183661]  [<ffffffff811426d6>] out_of_memory+0x206/0x5a0
[  915.187651]  [<ffffffff81142794>] ? out_of_memory+0x2c4/0x5a0
[  915.191706]  [<ffffffff811485b2>] __alloc_pages_nodemask+0xbe2/0xd90
[  915.196124]  [<ffffffff811934d6>] alloc_pages_vma+0xb6/0x290
[  915.200128]  [<ffffffff81170af8>] handle_mm_fault+0x12d8/0x16f0
[  915.204299]  [<ffffffff8116f868>] ? handle_mm_fault+0x48/0x16f0
[  915.208467]  [<ffffffff8105c089>] ? __do_page_fault+0x129/0x4f0
[  915.212599]  [<ffffffff8105c127>] __do_page_fault+0x1c7/0x4f0
[  915.216653]  [<ffffffff8105c480>] do_page_fault+0x30/0x80
[  915.220447]  [<ffffffff81669668>] page_fault+0x28/0x30
[  915.224311]  [<ffffffff81321f0d>] ? __clear_user+0x3d/0x70
[  915.228155]  [<ffffffff81321eee>] ? __clear_user+0x1e/0x70
[  915.231988]  [<ffffffff81326a98>] iov_iter_zero+0x68/0x250
[  915.235771]  [<ffffffff81400ed8>] read_iter_zero+0x38/0xa0
[  915.239517]  [<ffffffff811bd934>] __vfs_read+0xc4/0xf0
[  915.243041]  [<ffffffff811be49a>] vfs_read+0x7a/0x120
[  915.246526]  [<ffffffff811bed43>] SyS_read+0x53/0xd0
[  915.249922]  [<ffffffff8100364d>] do_syscall_64+0x5d/0x180
[  915.253613]  [<ffffffff81667bff>] entry_SYSCALL64_slow_path+0x25/0x25
(...snipped...)
[  915.410964] Out of memory: Kill process 1341 (idle-priority) score 846 or sacrifice child
[  915.416430] Killed process 1347 (normal-priority) total-vm:4172kB, anon-rss:80kB, file-rss:0kB, shmem-rss:0kB
(...snipped...)
[ 1066.855742] idle-priority   R  running task        0  1341   1316 0x00000080
[ 1066.861076]  ffff88003b06f898 ffff88003eb74080 ffff880039f3a000 ffff88003b070000
[ 1066.866715]  ffff88003b06f8d0 ffff88003c610240 000000010009635f ffffffff81c0dbd8
[ 1066.872338]  ffff88003b06f8b0 ffffffff81662ce0 ffff88003c610240 ffff88003b06f958
[ 1066.877976] Call Trace:
[ 1066.880120]  [<ffffffff81662ce0>] schedule+0x30/0x80
[ 1066.883963]  [<ffffffff81666d17>] schedule_timeout+0x117/0x1c0
[ 1066.888391]  [<ffffffff810dda00>] ? init_timer_key+0x40/0x40
[ 1066.892675]  [<ffffffff81666df9>] schedule_timeout_killable+0x19/0x20
[ 1066.897518]  [<ffffffff811426e0>] out_of_memory+0x210/0x5a0
[ 1066.901779]  [<ffffffff81142794>] ? out_of_memory+0x2c4/0x5a0
[ 1066.906153]  [<ffffffff811485b2>] __alloc_pages_nodemask+0xbe2/0xd90
[ 1066.910965]  [<ffffffff811934d6>] alloc_pages_vma+0xb6/0x290
[ 1066.915371]  [<ffffffff81170af8>] handle_mm_fault+0x12d8/0x16f0
[ 1066.919868]  [<ffffffff8116f868>] ? handle_mm_fault+0x48/0x16f0
[ 1066.924457]  [<ffffffff8105c089>] ? __do_page_fault+0x129/0x4f0
[ 1066.929197]  [<ffffffff8105c127>] __do_page_fault+0x1c7/0x4f0
[ 1066.933805]  [<ffffffff8105c480>] do_page_fault+0x30/0x80
[ 1066.938236]  [<ffffffff81669668>] page_fault+0x28/0x30
[ 1066.942352]  [<ffffffff81321f0d>] ? __clear_user+0x3d/0x70
[ 1066.946536]  [<ffffffff81321eee>] ? __clear_user+0x1e/0x70
[ 1066.950707]  [<ffffffff81326a98>] iov_iter_zero+0x68/0x250
[ 1066.954920]  [<ffffffff81400ed8>] read_iter_zero+0x38/0xa0
[ 1066.959044]  [<ffffffff811bd934>] __vfs_read+0xc4/0xf0
[ 1066.962965]  [<ffffffff811be49a>] vfs_read+0x7a/0x120
[ 1066.966825]  [<ffffffff811bed43>] SyS_read+0x53/0xd0
[ 1066.970615]  [<ffffffff8100364d>] do_syscall_64+0x5d/0x180
[ 1066.974741]  [<ffffffff81667bff>] entry_SYSCALL64_slow_path+0x25/0x25
(...snipped...)
[ 1312.850193] sysrq: SysRq : Manual OOM execution
[ 1440.303946] INFO: task kworker/3:1:46 blocked for more than 120 seconds.
[ 1440.309844]       Not tainted 4.5.0-rc6-next-20160301 #89
[ 1440.314332] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1440.320536] kworker/3:1     D ffff880039217ca8     0    46      2 0x00000000
[ 1440.326079] Workqueue: events moom_callback
[ 1440.329612]  ffff880039217ca8 ffff88003be120c0 ffff880039212000 ffff880039218000
[ 1440.335436]  ffffffff81c7a568 0000000000000246 ffff880039212000 00000000ffffffff
[ 1440.341197]  ffff880039217cc0 ffffffff81662ce0 ffffffff81c7a560 ffff880039217cd0
[ 1440.347021] Call Trace:
[ 1440.349347]  [<ffffffff81662ce0>] schedule+0x30/0x80
[ 1440.353419]  [<ffffffff81662fe9>] schedule_preempt_disabled+0x9/0x10
[ 1440.358324]  [<ffffffff81664b9f>] mutex_lock_nested+0x14f/0x3a0
[ 1440.362667]  [<ffffffff813e3f6e>] ? moom_callback+0x6e/0xb0
[ 1440.367123]  [<ffffffff813e3f6e>] moom_callback+0x6e/0xb0
[ 1440.371381]  [<ffffffff8108a955>] process_one_work+0x1a5/0x400
[ 1440.375863]  [<ffffffff8108a8f1>] ? process_one_work+0x141/0x400
[ 1440.380475]  [<ffffffff8108acd6>] worker_thread+0x126/0x490
[ 1440.384827]  [<ffffffff816625a4>] ? __schedule+0x314/0xa20
[ 1440.389139]  [<ffffffff8108abb0>] ? process_one_work+0x400/0x400
[ 1440.393820]  [<ffffffff81090c9e>] kthread+0xee/0x110
[ 1440.397749]  [<ffffffff81667d72>] ret_from_fork+0x22/0x50
[ 1440.401950]  [<ffffffff81090bb0>] ? kthread_create_on_node+0x230/0x230
[ 1440.406862] 3 locks held by kworker/3:1/46:
[ 1440.410200]  #0:  ("events"){.+.+.+}, at: [<ffffffff8108a8f1>] process_one_work+0x141/0x400
[ 1440.417208]  #1:  (moom_work){+.+...}, at: [<ffffffff8108a8f1>] process_one_work+0x141/0x400
[ 1440.423539]  #2:  (oom_lock){+.+...}, at: [<ffffffff813e3f6e>] moom_callback+0x6e/0xb0
(...snipped...)
[ 1525.328487] idle-priority   R  running task        0  1341   1316 0x00000080
[ 1525.333576]  ffff88003b06f898 ffff88003eb74080 ffff880039f3a000 ffff88003b070000
[ 1525.339361]  ffff88003b06f8d0 ffff88003c610240 000000010009635f ffffffff81c0dbd8
[ 1525.344851]  ffff88003b06f8b0 ffffffff81662ce0 ffff88003c610240 ffff88003b06f958
[ 1525.350410] Call Trace:
[ 1525.352698]  [<ffffffff81662ce0>] schedule+0x30/0x80
[ 1525.356262]  [<ffffffff81666d17>] schedule_timeout+0x117/0x1c0
[ 1525.360557]  [<ffffffff810dda00>] ? init_timer_key+0x40/0x40
[ 1525.365088]  [<ffffffff81666df9>] schedule_timeout_killable+0x19/0x20
[ 1525.370150]  [<ffffffff811426e0>] out_of_memory+0x210/0x5a0
[ 1525.374496]  [<ffffffff81142794>] ? out_of_memory+0x2c4/0x5a0
[ 1525.378937]  [<ffffffff811485b2>] __alloc_pages_nodemask+0xbe2/0xd90
[ 1525.383811]  [<ffffffff811934d6>] alloc_pages_vma+0xb6/0x290
[ 1525.388206]  [<ffffffff81170af8>] handle_mm_fault+0x12d8/0x16f0
[ 1525.392782]  [<ffffffff8116f868>] ? handle_mm_fault+0x48/0x16f0
[ 1525.397338]  [<ffffffff8105c089>] ? __do_page_fault+0x129/0x4f0
[ 1525.401933]  [<ffffffff8105c127>] __do_page_fault+0x1c7/0x4f0
[ 1525.406370]  [<ffffffff8105c480>] do_page_fault+0x30/0x80
[ 1525.410542]  [<ffffffff81669668>] page_fault+0x28/0x30
[ 1525.414698]  [<ffffffff81321f0d>] ? __clear_user+0x3d/0x70
[ 1525.418921]  [<ffffffff81321eee>] ? __clear_user+0x1e/0x70
[ 1525.423132]  [<ffffffff81326a98>] iov_iter_zero+0x68/0x250
[ 1525.427384]  [<ffffffff81400ed8>] read_iter_zero+0x38/0xa0
[ 1525.431592]  [<ffffffff811bd934>] __vfs_read+0xc4/0xf0
[ 1525.435548]  [<ffffffff811be49a>] vfs_read+0x7a/0x120
[ 1525.439440]  [<ffffffff811bed43>] SyS_read+0x53/0xd0
[ 1525.443259]  [<ffffffff8100364d>] do_syscall_64+0x5d/0x180
[ 1525.447420]  [<ffffffff81667bff>] entry_SYSCALL64_slow_path+0x25/0x25
(...snipped...)
[ 1560.429708] INFO: task kworker/3:1:46 blocked for more than 120 seconds.
[ 1560.435640]       Not tainted 4.5.0-rc6-next-20160301 #89
[ 1560.440208] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1560.446115] kworker/3:1     D ffff880039217ca8     0    46      2 0x00000000
[ 1560.451572] Workqueue: events moom_callback
[ 1560.455045]  ffff880039217ca8 ffff88003be120c0 ffff880039212000 ffff880039218000
[ 1560.460769]  ffffffff81c7a568 0000000000000246 ffff880039212000 00000000ffffffff
[ 1560.466530]  ffff880039217cc0 ffffffff81662ce0 ffffffff81c7a560 ffff880039217cd0
[ 1560.472363] Call Trace:
[ 1560.474603]  [<ffffffff81662ce0>] schedule+0x30/0x80
[ 1560.478892]  [<ffffffff81662fe9>] schedule_preempt_disabled+0x9/0x10
[ 1560.483713]  [<ffffffff81664b9f>] mutex_lock_nested+0x14f/0x3a0
[ 1560.488398]  [<ffffffff813e3f6e>] ? moom_callback+0x6e/0xb0
[ 1560.492703]  [<ffffffff813e3f6e>] moom_callback+0x6e/0xb0
[ 1560.496938]  [<ffffffff8108a955>] process_one_work+0x1a5/0x400
[ 1560.501391]  [<ffffffff8108a8f1>] ? process_one_work+0x141/0x400
[ 1560.505944]  [<ffffffff8108acd6>] worker_thread+0x126/0x490
[ 1560.510173]  [<ffffffff816625a4>] ? __schedule+0x314/0xa20
[ 1560.514330]  [<ffffffff8108abb0>] ? process_one_work+0x400/0x400
[ 1560.518899]  [<ffffffff81090c9e>] kthread+0xee/0x110
[ 1560.522760]  [<ffffffff81667d72>] ret_from_fork+0x22/0x50
[ 1560.526923]  [<ffffffff81090bb0>] ? kthread_create_on_node+0x230/0x230
[ 1560.531792] 3 locks held by kworker/3:1/46:
[ 1560.535086]  #0:  ("events"){.+.+.+}, at: [<ffffffff8108a8f1>] process_one_work+0x141/0x400
[ 1560.541351]  #1:  (moom_work){+.+...}, at: [<ffffffff8108a8f1>] process_one_work+0x141/0x400
[ 1560.547626]  #2:  (oom_lock){+.+...}, at: [<ffffffff813e3f6e>] moom_callback+0x6e/0xb0
[ 1582.487749] sysrq: SysRq : Kill All Tasks
[ 1582.530799] kworker/3:1 invoked oom-killer: gfp_mask=0x24000c0(GFP_KERNEL), order=-1, oom_score_adj=0
[ 1582.538355] kworker/3:1 cpuset=/ mems_allowed=0
[ 1582.570304] CPU: 3 PID: 46 Comm: kworker/3:1 Not tainted 4.5.0-rc6-next-20160301 #89
---------- console log end ----------
Complete log is at http://I-love.SAKURA.ne.jp/tmp/serial-20160303.txt.xz .
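
As a side note, a rough userspace analogue of the scheduling-delay part of the
problem would look something like the sketch below (illustrative only: the file
name and the number of CPU hogs are arbitrary, and the measured delay depends
entirely on the system it runs on):

---------- sched-idle-delay.c start ----------
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sched.h>
#include <time.h>
#include <signal.h>

int main(void)
{
	struct sched_param sp = { 0 };
	cpu_set_t cpu;
	struct timespec t1, t2, req = { 0, 1000000 }; /* 1ms, ~1 jiffy at HZ=1000 */
	int i;

	CPU_ZERO(&cpu);
	CPU_SET(0, &cpu);
	sched_setaffinity(0, sizeof(cpu), &cpu); /* pin everything to CPU 0 */
	signal(SIGCHLD, SIG_IGN);
	for (i = 0; i < 64; i++)
		if (fork() == 0)
			while (1); /* normal-priority CPU hog */
	sched_setscheduler(0, SCHED_IDLE, &sp);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	nanosleep(&req, NULL); /* userspace analogue of schedule_timeout_killable(1) */
	clock_gettime(CLOCK_MONOTONIC, &t2);
	printf("1ms sleep as SCHED_IDLE took %ld ms with 64 hogs on the same CPU\n",
	       (long) ((t2.tv_sec - t1.tv_sec) * 1000 +
		       (t2.tv_nsec - t1.tv_nsec) / 1000000));
	fflush(stdout);
	kill(0, SIGKILL); /* kill the whole process group, including ourselves */
	return 0;
}
---------- sched-idle-delay.c end ----------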

------------------------------------------------------------
From 92d4ec39ed23c6d0d5785f4f53311d55dfe480de Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
Date: Thu, 3 Mar 2016 13:27:06 +0900
Subject: [PATCH] mm,oom: Do not sleep with oom_lock held.

out_of_memory() can stall effectively forever if a SCHED_IDLE thread
calls out_of_memory() while !SCHED_IDLE threads are runnable on the
same CPU, because schedule_timeout_killable(1) cannot return promptly
when the sleeping thread is starved of CPU time by its low scheduling
priority.

Operations performed with oom_lock held should complete as soon as
possible, because holding the lock prolongs the OOM condition for
every other thread waiting on it. In particular, SysRq-f cannot make
progress while oom_lock is held.
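
The console log above shows the resulting state. Roughly sketched
(function names taken from those traces and from __alloc_pages_may_oom(),
which is where oom_lock is acquired):

  idle-priority (PID 1341, SCHED_IDLE)    kworker/3:1 (moom_callback)
  ------------------------------------    ---------------------------
  __alloc_pages_may_oom()
    mutex_trylock(&oom_lock) /* succeeds */
    out_of_memory()
      oom_kill_process()
      schedule_timeout_killable(1)
      /* starved of CPU time by the       mutex_lock(&oom_lock)
         normal-priority hogs, so         /* blocks for as long as
         oom_lock remains held for          PID 1341 keeps sleeping
         minutes */                         with oom_lock held */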

It would be possible to boost the scheduling priority of the current
thread while it holds oom_lock, but its priority could be changed
again by other threads after the boost. Unless operations done with
oom_lock held are offloaded to a dedicated high-priority kernel
thread, addressing this problem via priority manipulation is racy.
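
For reference, that racy alternative would look something like the sketch
below (illustrative only, not proposed here):

	struct sched_param param = { .sched_priority = 1 };
	int old_policy = current->policy;
	struct sched_param old = { .sched_priority = current->rt_priority };

	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
	schedule_timeout_killable(1);
	/*
	 * Racy: another thread may have called sched_setscheduler() on us
	 * while we slept, and the restore below silently undoes that change.
	 */
	sched_setscheduler_nocheck(current, old_policy, &old);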

This patch moves the schedule_timeout_killable(1) call out from under
oom_lock.

Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
---
 mm/oom_kill.c   |  8 +-------
 mm/page_alloc.c | 34 +++++++++++++++++++++++-----------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5d5eca9..dbef3a7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -901,15 +901,9 @@ bool out_of_memory(struct oom_control *oc)
 		dump_header(oc, NULL, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-	if (p && p != (void *)-1UL) {
+	if (p && p != (void *)-1UL)
 		oom_kill_process(oc, p, points, totalpages, NULL,
 				 "Out of memory");
-		/*
-		 * Give the killed process a good chance to exit before trying
-		 * to allocate memory again.
-		 */
-		schedule_timeout_killable(1);
-	}
 	return true;
 }

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1993894..cfe0997 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2871,20 +2871,32 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+	if (out_of_memory(&oc)) {
+		mutex_unlock(&oom_lock);
+		*did_some_progress = 1;
+		/*
+		 * Give the killed process a good chance to exit before trying
+		 * to allocate memory again. We should sleep after releasing
+		 * oom_lock because current thread might be SCHED_IDLE priority
+		 * which can sleep for minutes when preempted by other threads
+		 * with !SCHED_IDLE priority running on the same CPU.
+		 */
+		schedule_timeout_killable(1);
+		return NULL;
+	}
+	if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
 		*did_some_progress = 1;

-		if (gfp_mask & __GFP_NOFAIL) {
+		page = get_page_from_freelist(gfp_mask, order,
+					      ALLOC_NO_WATERMARKS|ALLOC_CPUSET,
+					      ac);
+		/*
+		 * fallback to ignore cpuset restriction if our nodes
+		 * are depleted
+		 */
+		if (!page)
 			page = get_page_from_freelist(gfp_mask, order,
-					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
-			/*
-			 * fallback to ignore cpuset restriction if our nodes
-			 * are depleted
-			 */
-			if (!page)
-				page = get_page_from_freelist(gfp_mask, order,
-					ALLOC_NO_WATERMARKS, ac);
-		}
+						      ALLOC_NO_WATERMARKS, ac);
 	}
 out:
 	mutex_unlock(&oom_lock);
-- 
1.8.3.1
