Re: How to avoid printk() delay caused by cond_resched() ?

Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> · Thu, 3 Mar 2016 00:02:01 +0900

Sergey Senozhatsky wrote:
> Hello Tetsuo,
> 
> On (03/02/16 21:01), Tetsuo Handa wrote:
> > I'm trying to dump information of all threads which might be relevant
> > to stalling inside memory allocator. But it seems to me that since this
> > patch changed to allow calling cond_resched() from printk() if it is
> > safe to do so, it is now possible that the thread which invoked the OOM
> > killer can sleep for minutes with the oom_lock mutex held when my dump is
> > in progress. I want to release oom_lock mutex as soon as possible so
> > that other threads can call out_of_memory() to get TIF_MEMDIE and exit
> > their allocations.
> 
> correct, otherwise chances are that you will softlockup or RCU stall
> the system -- console_unlock() does not stop until the printk logbuf
> has data in it (which can be appended concurrently from other CPUs).

Did you mean:

  console_unlock() does not stop until the printk logbuf has no more data
  in it (but data can be appended to the logbuf concurrently from other CPUs).

> this fact was the motivation for cond_resched() in console_unlock() in
> the first place and for the later patch that permit to some of printk
> callers to do the same.
> 
> > So, how can I prevent printk() triggered by out_of_memory() from sleeping
> > for minutes with oom_lock mutex held? Guard it with preempt_disable() /
> > preempt_enable() ? Guard it with rcu_read_lock() / rcu_read_unlock() ? 
> 
> correct, both preempt_disable() and rcu_read_lock() will certainly forbid
> cond_reshed() in printk()->console_unlock(), but it will also increase the
> softlockup or/and rcu stall probability, depending on how many threads you
> have and how many stack traces you need to print. can we do dirty things to
> the frequency of cond_resched()? for example, do cond_resched() not after
> every printk(), but every LOG_LINE_MAX chars.
> 

I think the caller of printk() should handle it. If an API for waiting for
the logbuf to be flushed exists, I'm happy to use it.

> several questions,
> do you use some sort of oom-kill reproducer? ... the problem with printk
> is that you never know which once of the CPUs will do the printing job at
> the end; any guess there is highly unreliable. but, assuming that you have
> executed that oom-kill reproducer many times before the patch in question
> do you have any rough numbers to compare how many seconds it used to take
> to dump_stack all of the tasks?

Yes, I'm using a stress tester which creates thousands of threads
(shown below). My dump (shown bottom) is intended for finding bugs
in corner cases and is written to wait for a bit for each thread
so that my dump shall not hold RCU lock for too long even if there
are thousands of threads to dump.

(This is just an example. There are many different versions.)
----------------------------------------
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>
#include <sched.h>
#include <sys/prctl.h>

static char stop = 0;
static int fd = EOF;

static void sigcld_handler(int unused)
{
	stop = 1;
}

static int memory_eater(void *unused)
{
	char *buf = NULL;
	unsigned long size = 0;
	char c;
	read(fd, &c, 1);
	while (1) {
		char *tmp = realloc(buf, size + 4096);
		if (!tmp)
			break;
		buf = tmp;
		buf[size] = 0;
		size += 4096;
	}
	pause();
	return 0;
}

int main(int argc, char *argv[])
{
	int pipe_fd[2] = { EOF, EOF };
	char *buf = NULL;
	unsigned long size;
	int i;
	if (chdir("/tmp"))
		return 1;
	if (pipe(pipe_fd))
		return 1;
	fd = pipe_fd[0];
	signal(SIGCLD, sigcld_handler);
	for (i = 0; i < 1024; i++) {
		if (fork() == 0) {
			char *stack = malloc(4096);
			char from[128] = { };
			char to[128] = { };
			const pid_t pid = getpid();
			unsigned char prev = 0;
			int fd = open("/proc/self/oom_score_adj", O_WRONLY);
			write(fd, "1000", 4);
			close(fd);
			close(pipe_fd[1]);
			snprintf(from, sizeof(from), "tgid=%u", pid);
			prctl(PR_SET_NAME, (unsigned long) from, 0, 0, 0);
			srand(pid);
			sleep(2);
			snprintf(from, sizeof(from), "file.%u-0", pid);
			fd = open(from, O_WRONLY | O_CREAT, 0600);
			if (fd == EOF)
				_exit(1);
			if (clone(memory_eater, stack + 4096, CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, NULL) == -1)
				_exit(1);
			while (1) {
				const unsigned char next = rand();
				snprintf(from, sizeof(from), "file.%u-%u", pid, prev);
				snprintf(to, sizeof(to), "file.%u-%u", pid, next);
				prev = next;
				rename(from, to);
				write(fd, "", 1);
			}
			_exit(0);
		}
	}
	close(pipe_fd[0]);
	for (size = 1048576; size < 512UL * (1 << 30); size <<= 1) {
		char *cp = realloc(buf, size);
		if (!cp) {
			size >>= 1;
			break;
		}
		buf = cp;
	}
	sleep(4);
	close(pipe_fd[1]);
	/* Will cause OOM due to overcommit */
	for (i = 0; i < size; i += 4096) {
		buf[i] = 0;
		if (stop)
			break;
	}
	pause();
	return 0;
}
----------------------------------------

(This is today's version. I'm trying to somehow avoid
"** XXX printk messages dropped **" messages.)
----------------------------------------

diff --git a/include/linux/console.h b/include/linux/console.h
index ea731af..11e936c 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -147,6 +147,7 @@ extern int unregister_console(struct console *);
 extern struct console *console_drivers;
 extern void console_lock(void);
 extern int console_trylock(void);
+extern void wait_console_flushed(unsigned long timeout);
 extern void console_unlock(void);
 extern void console_conditional_schedule(void);
 extern void console_unblank(void);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 45993b8..5647c5a 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -113,6 +113,16 @@ static inline bool task_will_free_mem(struct task_struct *task)
 		!(task->signal->flags & SIGNAL_GROUP_COREDUMP);
 }
 
+extern unsigned int out_of_memory_count;
+extern unsigned int no_out_of_memory_count;
+
+static inline void set_memalloc_location(const u8 location)
+{
+#ifdef CONFIG_DETECT_MEMALLOC_STALL_TASK
+	current->memalloc.known_locations = location;
+#endif
+}
+
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0b44fbc..ae21ab2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1393,6 +1393,31 @@ struct tlbflush_unmap_batch {
 	bool writable;
 };
 
+struct memalloc_info {
+	/* For locking and progress monitoring. */
+	unsigned int sequence;
+	/*
+	 * 0: not doing __GFP_RECLAIM allocation.
+	 * 1: doing non-recursive __GFP_RECLAIM allocation.
+	 * 2: doing recursive __GFP_RECLAIM allocation.
+	 */
+	u8 valid;
+	/*
+	 * bit 0: Will be reported as OOM victim.
+	 * bit 1: Will be reported as dying task.
+	 * bit 2: Will be reported as stalling task.
+	 * bit 3: Will be reported as exiting task.
+	 * bit 7: Will be reported unconditionally.
+	 */
+	u8 type;
+	u8 known_locations; /* to reduce amount of traces */
+	/* Started time in jiffies as of valid == 1. */
+	unsigned long start;
+	/* Requested order and gfp flags as of valid == 1. */
+	unsigned int order;
+	gfp_t gfp;
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1855,6 +1880,9 @@ struct task_struct {
 #ifdef CONFIG_MMU
 	struct list_head oom_reaper_list;
 #endif
+#ifdef CONFIG_DETECT_MEMALLOC_STALL_TASK
+	struct memalloc_info memalloc;
+#endif
 /* CPU-specific state of this task */
 	struct thread_struct thread;
 /*
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 22db1e6..7f2c230 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -9,6 +9,9 @@ extern int sysctl_hung_task_warnings;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 					 void __user *buffer,
 					 size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_DETECT_MEMALLOC_STALL_TASK
+extern unsigned long sysctl_memalloc_task_timeout_secs;
+#endif
 #else
 /* Avoid need for ifdefs elsewhere in the code */
 enum { sysctl_hung_task_timeout_secs = 0 };
diff --git a/kernel/fork.c b/kernel/fork.c
index d277e83..e7789ef 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1425,6 +1425,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->sequential_io_avg	= 0;
 #endif
 
+#ifdef CONFIG_DETECT_MEMALLOC_STALL_TASK
+	p->memalloc.sequence = 0;
+#endif
+
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
 	if (retval)
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d234022..13ad212 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -16,6 +16,8 @@
 #include <linux/export.h>
 #include <linux/sysctl.h>
 #include <linux/utsname.h>
+#include <linux/oom.h> /* out_of_memory_count */
+#include <linux/console.h> /* wait_console_flushed() */
 #include <trace/events/sched.h>
 
 /*
@@ -72,6 +74,248 @@ static struct notifier_block panic_block = {
 	.notifier_call = hung_task_panic,
 };
 
+#ifdef CONFIG_DETECT_MEMALLOC_STALL_TASK
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_memalloc_task_timeout_secs =
+	CONFIG_DEFAULT_MEMALLOC_TASK_TIMEOUT;
+static struct memalloc_info memalloc; /* Filled by is_stalling_task(). */
+
+static long memalloc_timeout_jiffies(unsigned long last_checked, long timeout)
+{
+	struct task_struct *g, *p;
+	long t;
+	unsigned long delta;
+
+	/* timeout of 0 will disable the watchdog */
+	if (!timeout)
+		return MAX_SCHEDULE_TIMEOUT;
+	/* At least wait for timeout duration. */
+	t = last_checked - jiffies + timeout * HZ;
+	if (t > 0)
+		return t;
+	/* Calculate how long to wait more. */
+	t = timeout * HZ;
+	delta = t - jiffies;
+
+	/*
+	 * We might see outdated values in "struct memalloc_info" here.
+	 * We will recheck later using is_stalling_task().
+	 */
+	preempt_disable();
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		if (likely(!p->memalloc.valid))
+			continue;
+		t = min_t(long, t, p->memalloc.start + delta);
+		if (unlikely(t <= 0))
+			goto stalling;
+	}
+ stalling:
+	rcu_read_unlock();
+	preempt_enable();
+	return t;
+}
+
+/**
+ * is_stalling_task - Check and copy a task's memalloc variable.
+ *
+ * @task:   A task to check.
+ * @expire: Timeout in jiffies.
+ *
+ * Returns true if a task is stalling, false otherwise.
+ */
+static bool is_stalling_task(const struct task_struct *task,
+			     const unsigned long expire)
+{
+	const struct memalloc_info *m = &task->memalloc;
+
+	/*
+	 * If start_memalloc_timer() is updating "struct memalloc_info" now,
+	 * we can ignore it because timeout jiffies cannot be expired as soon
+	 * as updating it completes.
+	 */
+	if (!m->valid || (m->sequence & 1))
+		return false;
+	smp_rmb(); /* Block start_memalloc_timer(). */
+	memalloc.start = m->start;
+	memalloc.order = m->order;
+	memalloc.gfp = m->gfp;
+	smp_rmb(); /* Unblock start_memalloc_timer(). */
+	memalloc.sequence = m->sequence;
+	/*
+	 * If start_memalloc_timer() started updating it while we read it,
+	 * we can ignore it for the same reason.
+	 */
+	if (!m->valid || (memalloc.sequence & 1))
+		return false;
+	/* This is a valid "struct memalloc_info". Check for timeout. */
+	return time_after_eq(expire, memalloc.start);
+}
+
+/* Check for memory allocation stalls. */
+static void check_memalloc_stalling_tasks(unsigned long timeout)
+{
+	char buf[256];
+	struct task_struct *g, *p;
+	unsigned long now;
+	unsigned long expire;
+	unsigned int sigkill_pending;
+	unsigned int exiting_tasks;
+	unsigned int memdie_pending;
+	unsigned int stalling_tasks;
+
+	cond_resched();
+	now = jiffies;
+	/*
+	 * Report tasks that stalled for more than half of timeout duration
+	 * because such tasks might be correlated with tasks that already
+	 * stalled for full timeout duration.
+	 */
+	expire = now - timeout * (HZ / 2);
+	/* Count stalling tasks, dying and victim tasks. */
+	sigkill_pending = 0;
+	exiting_tasks = 0;
+	memdie_pending = 0;
+	stalling_tasks = 0;
+	preempt_disable();
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		u8 type = 0;
+
+		if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
+			type |= 1;
+			memdie_pending++;
+		}
+		if (fatal_signal_pending(p)) {
+			type |= 2;
+			sigkill_pending++;
+		}
+		if ((p->flags & PF_EXITING) && p->state != TASK_DEAD) {
+			type |= 8;
+			exiting_tasks++;
+		}
+		if (is_stalling_task(p, expire)) {
+			type |= 4;
+			stalling_tasks++;
+		}
+		if (p->flags & PF_KSWAPD)
+			type |= 128;
+		p->memalloc.type = type;
+	}
+	rcu_read_unlock();
+	preempt_enable_no_resched();
+	if (!stalling_tasks)
+		return;
+	wait_console_flushed(HZ);
+	cond_resched();
+	/* Report stalling tasks, dying and victim tasks. */
+	pr_warn("MemAlloc-Info: stalling=%u dying=%u exiting=%u victim=%u oom_count=%u/%u\n",
+		stalling_tasks, sigkill_pending, exiting_tasks, memdie_pending, out_of_memory_count,
+		no_out_of_memory_count);
+	cond_resched();
+	sigkill_pending = 0;
+	exiting_tasks = 0;
+	memdie_pending = 0;
+	stalling_tasks = 0;
+	preempt_disable();
+	rcu_read_lock();
+ restart_report:
+	for_each_process_thread(g, p) {
+		bool can_cont;
+		u8 type;
+
+		if (likely(!p->memalloc.type))
+			continue;
+		p->memalloc.type = 0;
+		/* Recheck in case state changed meanwhile. */
+		type = 0;
+		if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
+			type |= 1;
+			memdie_pending++;
+		}
+		if (fatal_signal_pending(p)) {
+			type |= 2;
+			sigkill_pending++;
+		}
+		if ((p->flags & PF_EXITING) && p->state != TASK_DEAD) {
+			type |= 8;
+			exiting_tasks++;
+		}
+		if (is_stalling_task(p, expire)) {
+			type |= 4;
+			stalling_tasks++;
+			snprintf(buf, sizeof(buf),
+				 " seq=%u gfp=0x%x(%pGg) order=%u delay=%lu",
+				 memalloc.sequence >> 1, memalloc.gfp, &memalloc.gfp,
+				 memalloc.order, now - memalloc.start);
+		} else {
+			buf[0] = '\0';
+		}
+		if (p->flags & PF_KSWAPD)
+			type |= 128;
+		if (unlikely(!type))
+			continue;
+		/*
+		 * Victim tasks get pending SIGKILL removed before arriving at
+		 * do_exit(). Therefore, print " exiting" instead for " dying".
+		 */
+		pr_warn("MemAlloc: %s(%u) flags=0x%x switches=%lu%s%s%s%s%s\n", p->comm,
+			p->pid, p->flags, p->nvcsw + p->nivcsw, buf,
+			(p->state & TASK_UNINTERRUPTIBLE) ?
+			" uninterruptible" : "",
+			(type & 8) ? " exiting" : "",
+			(type & 2) ? " dying" : "",
+			(type & 1) ? " victim" : "");
+		switch (p->memalloc.known_locations) {
+		case 1:
+			pr_warn("Call Trace: looping inside shrink_inactive_list()\n");
+			break;
+		case 2:
+			pr_warn("Call Trace: waiting for oom_lock\n");
+			break;
+		default:
+			sched_show_task(p);
+		}
+		/*
+		 * Since there could be thousands of tasks to report, we always
+		 * call cond_resched() after each report, in order to avoid RCU
+		 * stalls.
+		 *
+		 * Since not yet reported tasks have p->memalloc.type > 0, we
+		 * can simply restart this loop in case "g" or "p" went away.
+		 */
+		get_task_struct(g);
+		get_task_struct(p);
+		rcu_read_unlock();
+		preempt_enable_no_resched();
+		wait_console_flushed(HZ / 10);
+		cond_resched();
+		preempt_disable();
+		rcu_read_lock();
+		can_cont = pid_alive(g) && pid_alive(p);
+		put_task_struct(p);
+		put_task_struct(g);
+		if (!can_cont)
+			goto restart_report;
+	}
+	rcu_read_unlock();
+	preempt_enable_no_resched();
+	cond_resched();
+	/* Show memory information. (SysRq-m) */
+	show_mem(0);
+	/* Show workqueue state. */
+	show_workqueue_state();
+	/* Show lock information. (SysRq-d) */
+	debug_show_all_locks();
+	wait_console_flushed(HZ / 10);
+	pr_warn("MemAlloc-Info: stalling=%u dying=%u exiting=%u victim=%u oom_count=%u/%u\n",
+		stalling_tasks, sigkill_pending, exiting_tasks, memdie_pending, out_of_memory_count,
+		no_out_of_memory_count);
+}
+#endif /* CONFIG_DETECT_MEMALLOC_STALL_TASK */
+
 static void check_hung_task(struct task_struct *t, unsigned long timeout)
 {
 	unsigned long switch_count = t->nvcsw + t->nivcsw;
@@ -227,20 +471,35 @@ EXPORT_SYMBOL_GPL(reset_hung_task_detector);
 static int watchdog(void *dummy)
 {
 	unsigned long hung_last_checked = jiffies;
+#ifdef CONFIG_DETECT_MEMALLOC_STALL_TASK
+	unsigned long stall_last_checked = hung_last_checked;
+#endif
 
 	set_user_nice(current, 0);
 
 	for ( ; ; ) {
 		unsigned long timeout = sysctl_hung_task_timeout_secs;
 		long t = hung_timeout_jiffies(hung_last_checked, timeout);
-
+#ifdef CONFIG_DETECT_MEMALLOC_STALL_TASK
+		unsigned long timeout2 = sysctl_memalloc_task_timeout_secs;
+		long t2 = memalloc_timeout_jiffies(stall_last_checked,
+						   timeout2);
+
+		if (t2 <= 0) {
+			check_memalloc_stalling_tasks(timeout2);
+			stall_last_checked = jiffies;
+			continue;
+		}
+#else
+		long t2 = t;
+#endif
 		if (t <= 0) {
 			if (!atomic_xchg(&reset_hung_task, 0))
 				check_hung_uninterruptible_tasks(timeout);
 			hung_last_checked = jiffies;
 			continue;
 		}
-		schedule_timeout_interruptible(t);
+		schedule_timeout_interruptible(min(t, t2));
 	}
 
 	return 0;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9917f69..2eb60df 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -121,6 +121,15 @@ static int __down_trylock_console_sem(unsigned long ip)
 	up(&console_sem);\
 } while (0)
 
+static int __down_timeout_console_sem(unsigned long timeout, unsigned long ip)
+{
+	if (down_timeout(&console_sem, timeout))
+		return 1;
+	mutex_acquire(&console_lock_dep_map, 0, 1, ip);
+	return 0;
+}
+#define down_timeout_console_sem(timeout) __down_timeout_console_sem((timeout), _RET_IP_)
+
 /*
  * This is used for debugging the mess that is the VT code by
  * keeping track if we have the console semaphore held. It's
@@ -2125,6 +2134,21 @@ int console_trylock(void)
 }
 EXPORT_SYMBOL(console_trylock);
 
+void wait_console_flushed(unsigned long timeout)
+{
+	might_sleep();
+
+	if (down_timeout_console_sem(timeout))
+		return;
+	if (console_suspended) {
+		up_console_sem();
+		return;
+	}
+	console_locked = 1;
+	console_may_schedule = 1;
+	console_unlock();
+}
+
 int is_console_locked(void)
 {
 	return console_locked;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 96ec234..8bc1c5b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1073,6 +1073,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &neg_one,
 	},
+#ifdef CONFIG_DETECT_MEMALLOC_STALL_TASK
+	{
+		.procname	= "memalloc_task_timeout_secs",
+		.data		= &sysctl_memalloc_task_timeout_secs,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_dohung_task_timeout_secs,
+		.extra2		= &hung_task_timeout_max,
+	},
+#endif
 #endif
 #ifdef CONFIG_COMPAT
 	{
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4a8bfe2..8bbc655 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -864,6 +864,30 @@ config WQ_WATCHDOG
 	  state.  This can be configured through kernel parameter
 	  "workqueue.watchdog_thresh" and its sysfs counterpart.
 
+config DETECT_MEMALLOC_STALL_TASK
+	bool "Detect tasks stalling inside memory allocator"
+	default n
+	depends on DETECT_HUNG_TASK
+	help
+	  This option emits warning messages and traces when memory
+	  allocation requests are stalling, in order to catch unexplained
+	  hangups/reboots caused by memory allocation stalls.
+
+config DEFAULT_MEMALLOC_TASK_TIMEOUT
+	int "Default timeout for stalling task detection (in seconds)"
+	depends on DETECT_MEMALLOC_STALL_TASK
+	default 10
+	help
+	  This option controls the default timeout (in seconds) used
+	  to determine when a task has become non-responsive and should
+	  be considered stalling inside memory allocator.
+
+	  It can be adjusted at runtime via the kernel.memalloc_task_timeout_secs
+	  sysctl or by writing a value to
+	  /proc/sys/kernel/memalloc_task_timeout_secs.
+
+	  A timeout of 0 disables the check. The default is 10 seconds.
+
 endmenu # "Debug lockups and hangs"
 
 config PANIC_ON_OOPS
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5d5eca9..bb5ea86 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
 
+unsigned int out_of_memory_count;
+
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
@@ -855,6 +857,7 @@ bool out_of_memory(struct oom_control *oc)
 	unsigned int uninitialized_var(points);
 	enum oom_constraint constraint = CONSTRAINT_NONE;
 
+	out_of_memory_count++;
 	if (oom_killer_disabled)
 		return false;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1993894..1529ccf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2806,6 +2806,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
 		show_mem(filter);
 }
 
+unsigned int no_out_of_memory_count;
+
 static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	const struct alloc_context *ac, unsigned long *did_some_progress)
@@ -2826,7 +2828,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	 */
 	if (!mutex_trylock(&oom_lock)) {
 		*did_some_progress = 1;
+		set_memalloc_location(2);
 		schedule_timeout_uninterruptible(1);
+		set_memalloc_location(0);
 		return NULL;
 	}
 
@@ -2862,6 +2866,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			 * enter a quiescent state during suspend.
 			 */
 			*did_some_progress = !oom_killer_disabled;
+			no_out_of_memory_count++;
 			goto out;
 		}
 		if (pm_suspended_storage())
@@ -3399,6 +3404,37 @@ got_pg:
 	return page;
 }
 
+#ifdef CONFIG_DETECT_MEMALLOC_STALL_TASK
+static void start_memalloc_timer(const gfp_t gfp_mask, const int order)
+{
+	struct memalloc_info *m = &current->memalloc;
+
+	/* We don't check for stalls for !__GFP_RECLAIM allocations. */
+	if (!(gfp_mask & __GFP_RECLAIM))
+		return;
+	/* We don't check for stalls for nested __GFP_RECLAIM allocations */
+	if (!m->valid) {
+		m->sequence++;
+		smp_wmb(); /* Block is_stalling_task(). */
+		m->start = jiffies;
+		m->order = order;
+		m->gfp = gfp_mask;
+		smp_wmb(); /* Unblock is_stalling_task(). */
+		m->sequence++;
+	}
+	m->valid++;
+}
+
+static void stop_memalloc_timer(const gfp_t gfp_mask)
+{
+	if (gfp_mask & __GFP_RECLAIM)
+		current->memalloc.valid--;
+}
+#else
+#define start_memalloc_timer(gfp_mask, order) do { } while (0)
+#define stop_memalloc_timer(gfp_mask) do { } while (0)
+#endif
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -3466,7 +3502,9 @@ retry_cpuset:
 		alloc_mask = memalloc_noio_flags(gfp_mask);
 		ac.spread_dirty_pages = false;
 
+		start_memalloc_timer(alloc_mask, order);
 		page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+		stop_memalloc_timer(alloc_mask);
 	}
 
 	if (kmemcheck_enabled && page)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 39e90e2..1231dcf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1581,14 +1581,31 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	int file = is_file_lru(lru);
 	struct zone *zone = lruvec_zone(lruvec);
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+	unsigned char counter = 0;
 
+	set_memalloc_location(1);
 	while (unlikely(too_many_isolated(zone, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		/* We are about to die and free our memory. Return now. */
-		if (fatal_signal_pending(current))
+		if (fatal_signal_pending(current)) {
+			set_memalloc_location(0);
 			return SWAP_CLUSTER_MAX;
+		}
+		if (!++counter) {
+			if (file)
+				printk(KERN_WARNING "zone=%s NR_INACTIVE_FILE=%lu NR_ISOLATED_FILE=%lu\n",
+				       zone->name,
+				       zone_page_state(zone, NR_INACTIVE_FILE),
+				       zone_page_state(zone, NR_ISOLATED_FILE));
+			else
+				printk(KERN_WARNING "zone=%s NR_INACTIVE_ANON=%lu NR_ISOLATED_ANON=%lu\n",
+				       zone->name,
+				       zone_page_state(zone, NR_INACTIVE_ANON),
+				       zone_page_state(zone, NR_ISOLATED_ANON));
+		}
 	}
+	set_memalloc_location(0);
 
 	lru_add_drain();
 
----------------------------------------

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>