Arkadiusz Miśkiewicz wrote:
> This patch is against which tree? (tried 4.1, 4.2 and 4.3)

Oops. Whitespace-damaged. This patch is for vanilla 4.1.2.
Reposting with one condition corrected.

  if (!dump_target_pid) => if (dump_target_pid <= 0)
---
 fs/xfs/kmem.c          |  10 ++-
 fs/xfs/xfs_buf.c       |   3 +-
 include/linux/mmzone.h |   1 +
 include/linux/vmstat.h |   1 +
 mm/page_alloc.c        | 217 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c            |  22 +++
 6 files changed, 249 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a7a3a63..535c136 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -55,8 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
-	"possible memory allocation deadlock in %s (mode:0x%x)",
-					__func__, lflags);
+	"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+				current->comm, current->pid,
+					__func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
@@ -120,8 +121,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
-	"possible memory allocation deadlock in %s (mode:0x%x)",
-					__func__, lflags);
+	"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+				current->comm, current->pid,
+					__func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1790b00..16322cb 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -354,7 +354,8 @@ retry:
 		 */
 		if (!(++retries % 100))
 			xfs_err(NULL,
-		"possible memory allocation deadlock in %s (mode:0x%x)",
+		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+				current->comm, current->pid,
 				__func__, gfp_mask);
 
 		XFS_STATS_INC(xb_page_retries);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 54d74f6..932a6d6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -527,6 +527,7 @@ struct zone {
 	ZONE_PADDING(_pad3_)
 	/* Zone statistics */
 	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
+	unsigned long		stat_last_updated[NR_VM_ZONE_STAT_ITEMS];
 } ____cacheline_internodealigned_in_smp;
 
 enum zone_flags {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 82e7db7..2488925 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -115,6 +115,7 @@ static inline void zone_page_state_add(long x, struct zone *zone,
 {
 	atomic_long_add(x, &zone->vm_stat[item]);
 	atomic_long_add(x, &vm_stat[item]);
+	zone->stat_last_updated[item] = jiffies;
 }
 
 static inline unsigned long global_page_state(enum zone_stat_item item)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 18490f3..35a46b4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,6 +61,8 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/page_owner.h>
+#include <linux/nmi.h>
+#include <linux/kthread.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -2496,6 +2498,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 }
 #endif /* CONFIG_COMPACTION */
 
+pid_t dump_target_pid;
+
 /* Perform direct synchronous page reclaim */
 static int
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -2645,6 +2649,208 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
 }
 
+static unsigned long kmallocwd_timeout = 10 * HZ; /* Scan interval. */
+static u8 memalloc_counter_active_index; /* Either 0 or 1. */
+static int memalloc_counter[2]; /* Number of tasks doing memory allocation. */
+
+struct memalloc {
+	struct list_head list; /* Connected to memalloc_list. */
+	struct task_struct *task; /* Initialized to current. */
+	unsigned long start; /* Initialized to jiffies. */
+	unsigned int order;
+	gfp_t gfp;
+	u8 index; /* Initialized to memalloc_counter_active_index. */
+	u8 dumped;
+};
+
+static LIST_HEAD(memalloc_list); /* List of "struct memalloc". */
+static DEFINE_SPINLOCK(memalloc_list_lock); /* Lock for memalloc_list. */
+
+/*
+ * kmallocwd - A kernel thread for monitoring memory allocation stalls.
+ *
+ * @unused: Not used.
+ *
+ * This kernel thread does not terminate.
+ */
+static int kmallocwd(void *unused)
+{
+	struct memalloc *m;
+	struct task_struct *g, *p;
+	unsigned long now;
+	unsigned int sigkill_pending;
+	unsigned int memdie_pending;
+	unsigned int stalling_tasks;
+	u8 index;
+	pid_t pid;
+
+ not_stalling: /* Healthy case. */
+	/* Switch active counter and wait for timeout duration. */
+	index = memalloc_counter_active_index;
+	spin_lock(&memalloc_list_lock);
+	memalloc_counter_active_index ^= 1;
+	spin_unlock(&memalloc_list_lock);
+	schedule_timeout_interruptible(kmallocwd_timeout);
+	/*
+	 * If memory allocations are working, the counter should remain 0
+	 * because tasks will be able to call both start_memalloc_timer()
+	 * and stop_memalloc_timer() within timeout duration.
+	 */
+	if (likely(!memalloc_counter[index]))
+		goto not_stalling;
+ maybe_stalling: /* Maybe something is wrong. Let's check. */
+	now = jiffies;
+	/* Count stalling tasks, dying and victim tasks. */
+	sigkill_pending = 0;
+	memdie_pending = 0;
+	stalling_tasks = 0;
+	pid = 0;
+	spin_lock(&memalloc_list_lock);
+	list_for_each_entry(m, &memalloc_list, list) {
+		if (time_after(now - m->start, kmallocwd_timeout))
+			stalling_tasks++;
+	}
+	spin_unlock(&memalloc_list_lock);
+	preempt_disable();
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		if (test_tsk_thread_flag(p, TIF_MEMDIE))
+			memdie_pending++;
+		if (fatal_signal_pending(p))
+			sigkill_pending++;
+	}
+	rcu_read_unlock();
+	preempt_enable();
+	cond_resched();
+	pr_warn("MemAlloc-Info: %u stalling task, %u dying task, %u victim task.\n",
+		stalling_tasks, sigkill_pending, memdie_pending);
+	/* Report stalling tasks, dying and victim tasks. */
+	spin_lock(&memalloc_list_lock);
+	list_for_each_entry(m, &memalloc_list, list) {
+		if (time_before(now - m->start, kmallocwd_timeout))
+			continue;
+		p = m->task;
+		pr_warn("MemAlloc: %s(%u) gfp=0x%x order=%u delay=%lu\n",
+			p->comm, p->pid, m->gfp, m->order, now - m->start);
+	}
+	spin_unlock(&memalloc_list_lock);
+	preempt_disable();
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		u8 type = 0;
+
+		if (test_tsk_thread_flag(p, TIF_MEMDIE))
+			type |= 1;
+		if (fatal_signal_pending(p))
+			type |= 2;
+		if (likely(!type))
+			continue;
+		if (p->state & TASK_UNINTERRUPTIBLE)
+			type |= 4;
+		pr_warn("MemAlloc: %s(%u)%s%s%s\n", p->comm, p->pid,
+			(type & 4) ? " uninterruptible" : "",
+			(type & 2) ? " dying" : "",
+			(type & 1) ? " victim" : "");
+	}
+	rcu_read_unlock();
+	preempt_enable();
+	cond_resched();
+	/*
+	 * Show traces of newly reported (or too long) stalling tasks.
+	 *
+	 * Show traces only once per 256 timeouts because their traces
+	 * will likely be the same (e.g. cond_resched() or congestion_wait())
+	 * when they are stalling inside __alloc_pages_slowpath().
+	 */
+	spin_lock(&memalloc_list_lock);
+	list_for_each_entry(m, &memalloc_list, list) {
+		if (time_before(now - m->start, kmallocwd_timeout) ||
+		    m->dumped++)
+			continue;
+		p = m->task;
+		sched_show_task(p);
+		debug_show_held_locks(p);
+		touch_nmi_watchdog();
+		if (!pid)
+			pid = p->pid;
+	}
+	spin_unlock(&memalloc_list_lock);
+	/*
+	 * Show traces of dying tasks (including victim tasks).
+	 *
+	 * Only dying tasks which are in trouble (e.g. blocked at unkillable
+	 * locks held by memory allocating tasks) will be repeatedly shown.
+	 * Therefore, we need to pay attention to tasks repeatedly shown here.
+	 */
+	preempt_disable();
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		if (likely(!fatal_signal_pending(p)))
+			continue;
+		sched_show_task(p);
+		debug_show_held_locks(p);
+		touch_nmi_watchdog();
+	}
+	rcu_read_unlock();
+	preempt_enable();
+	show_workqueue_state();
+	if (dump_target_pid <= 0)
+		dump_target_pid = -pid;
+	/* Wait until next timeout duration. */
+	schedule_timeout_interruptible(kmallocwd_timeout);
+	if (memalloc_counter[index])
+		goto maybe_stalling;
+	goto not_stalling;
+	return 0; /* To suppress "no return statement" compiler warning. */
+}
+
+static int __init start_kmallocwd(void)
+{
+	if (kmallocwd_timeout) {
+		struct task_struct *task = kthread_run(kmallocwd, NULL,
+						       "kmallocwd");
+		BUG_ON(IS_ERR(task));
+	}
+	return 0;
+}
+late_initcall(start_kmallocwd);
+
+static int __init kmallocwd_config(char *str)
+{
+	if (kstrtoul(str, 10, &kmallocwd_timeout) == 0)
+		kmallocwd_timeout = min(kmallocwd_timeout * HZ,
+					(unsigned long) LONG_MAX);
+	return 0;
+}
+__setup("kmallocwd=", kmallocwd_config);
+
+static void start_memalloc_timer(struct memalloc *m, const gfp_t gfp_mask,
+				 const int order)
+{
+	if (!kmallocwd_timeout || m->task)
+		return;
+	m->task = current;
+	m->start = jiffies;
+	m->gfp = gfp_mask;
+	m->order = order;
+	m->dumped = 0;
+	spin_lock(&memalloc_list_lock);
+	m->index = memalloc_counter_active_index;
+	memalloc_counter[m->index]++;
+	list_add_tail(&m->list, &memalloc_list);
+	spin_unlock(&memalloc_list_lock);
+}
+
+static void stop_memalloc_timer(struct memalloc *m)
+{
+	if (!m->task)
+		return;
+	spin_lock(&memalloc_list_lock);
+	memalloc_counter[m->index]--;
+	list_del(&m->list);
+	spin_unlock(&memalloc_list_lock);
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 						struct alloc_context *ac)
@@ -2657,6 +2863,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	bool deferred_compaction = false;
 	int contended_compaction = COMPACT_CONTENDED_NONE;
+	struct memalloc m = { .task = NULL };
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2678,6 +2885,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 retry:
+	if (dump_target_pid == -current->pid)
+		dump_target_pid = -dump_target_pid;
+
 	if (!(gfp_mask & __GFP_NO_KSWAPD))
 		wake_all_kswapds(order, ac);
 
@@ -2740,6 +2950,8 @@ retry:
 	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
 		goto nopage;
 
+	start_memalloc_timer(&m, gfp_mask, order);
+
 	/*
 	 * Try direct compaction. The first pass is asynchronous. Subsequent
 	 * attempts after direct reclaim are synchronous
@@ -2798,6 +3010,10 @@ retry:
 		goto got_pg;
 
 	/* Check if we should retry the allocation */
+	if (dump_target_pid == current->pid) {
+		printk(KERN_INFO "did_some_progress=%lu\n", did_some_progress);
+		dump_target_pid = 0;
+	}
 	pages_reclaimed += did_some_progress;
 	if (should_alloc_retry(gfp_mask, order, did_some_progress,
 						pages_reclaimed)) {
@@ -2834,6 +3050,7 @@ retry:
 nopage:
 	warn_alloc_failed(gfp_mask, order, NULL);
 got_pg:
+	stop_memalloc_timer(&m);
 	return page;
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1a17bd7..c449371 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2432,6 +2432,8 @@ static inline bool compaction_ready(struct zone *zone, int order)
 	return watermark_ok;
 }
 
+extern pid_t dump_target_pid;
+
 /*
  * This is the direct reclaim path, for page-allocating processes. We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2533,7 +2535,27 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 
 		if (global_reclaim(sc) &&
 		    !reclaimable && zone_reclaimable(zone))
+		{
+			if (dump_target_pid == current->pid) {
+				unsigned long rec = zone_reclaimable_pages(zone);
+				unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
+				unsigned long min = min_wmark_pages(zone);
+				unsigned long scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+				unsigned long now = jiffies;
+				unsigned long rec2 = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
+					zone_page_state_snapshot(zone, NR_INACTIVE_FILE);
+				unsigned long free2 = zone_page_state_snapshot(zone, NR_FREE_PAGES);
+				unsigned long scanned2 = zone_page_state_snapshot(zone, NR_PAGES_SCANNED);
+
+				printk(KERN_INFO "%s zone_reclaimable: reclaim:%lu(%lu,%lu,%ld) free:%lu(%lu,%ld) min:%lu pages_scanned:%lu(%lu,%ld) prio:%d\n",
+					zone->name, rec, now - zone->stat_last_updated[NR_ACTIVE_FILE],
+					now - zone->stat_last_updated[NR_INACTIVE_FILE], rec - rec2,
+					free, now - zone->stat_last_updated[NR_FREE_PAGES], free - free2,
+					min, scanned, now - zone->stat_last_updated[NR_PAGES_SCANNED],
+					scanned - scanned2, sc->priority);
+			}
 			reclaimable = true;
+		}
 	}
 
 	/*
-- 
1.8.3.1
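
For reference, the call pattern the watchdog expects from a blocking allocation
path is the one used in the __alloc_pages_slowpath() hunks above: keep a
struct memalloc on the stack, register it with start_memalloc_timer() before
entering the retry loop, and unregister it with stop_memalloc_timer() on every
exit path. Below is a condensed sketch of that pattern only; it is not part of
the patch, try_one_allocation() is a hypothetical stand-in for the real
freelist/reclaim/compaction attempts, and both helpers are static to
mm/page_alloc.c in this version, so any additional caller would have to live
there.

/*
 * Illustrative sketch, not part of the patch: how a blocking allocation
 * path uses the watchdog hooks added above.  try_one_allocation() is a
 * hypothetical placeholder for one freelist/reclaim/compaction attempt.
 */
static struct page *allocate_with_watchdog(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;
	struct memalloc m = { .task = NULL };	/* on-stack tracking record */

	/*
	 * Register with kmallocwd.  The record stays on memalloc_list (and
	 * keeps memalloc_counter[] elevated) until stop_memalloc_timer(),
	 * so a stall anywhere in the loop below gets reported once the
	 * entry is older than kmallocwd_timeout.
	 */
	start_memalloc_timer(&m, gfp_mask, order);
	do {
		page = try_one_allocation(gfp_mask, order); /* hypothetical */
	} while (!page && !(gfp_mask & __GFP_NORETRY));
	stop_memalloc_timer(&m);	/* no longer counted as stalling */
	return page;
}

Booting with kmallocwd=N sets the scan interval to N seconds, and kmallocwd=0
disables the whole thing: start_kmallocwd() never spawns the thread and
start_memalloc_timer() returns immediately.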