Re: [memcg] 1fc14cf673: invoked_oom-killer:gfp_mask=0x

Hillf Danton <hdanton@xxxxxxxx> · Sat, 9 Nov 2019 20:19:11 +0800

Hey Rong

On Thu, 7 Nov 2019 17:02:34 +0800 Rong Chen wrote:
> 
> FYI, we noticed the following commit (built with gcc-7):
> 
> commit: 1fc14cf67325190e0075cf3cd5511965499fffb4 ("[RFC v2] memcg: add memcg lru for page reclaiming")
> url: https://github.com/0day-ci/linux/commits/Hillf-Danton/memcg-add-memcg-lru-for-page-reclaiming/20191029-143906
> 
> 
> in testcase: vm-scalability
> with following parameters:
> 
> 	runtime: 300s
> 	test: lru-file-mmap-read
> 	cpufreq_governor: performance
> 	ucode: 0x500002b
> 
> test-description: The motivation behind this suite is to exercise functions and regions of the mm/ of the Linux kernel which are of interest to us.
> test-url: https://git.kernel.org/cgit/linux/kernel/git/wfg/vm-scalability.git/
> 
> 
> on test machine: 192 threads Intel(R) Xeon(R) Platinum 9242 CPU @ 2.30GHz with 192G memory
> 
> caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
> 
> 
> +--------------------------------------------------+------------+------------+
> |                                                  | 8005803a2c | 1fc14cf673 |
> +--------------------------------------------------+------------+------------+
> | boot_successes                                   | 2          | 4          |
> | boot_failures                                    | 11         |            |
> | WARNING:at_fs/iomap/direct-io.c:#iomap_dio_actor | 10         |            |
> | RIP:iomap_dio_actor                              | 10         |            |
> | BUG:kernel_hang_in_boot_stage                    | 1          |            |
> | last_state.OOM                                   | 0          | 4          |
> +--------------------------------------------------+------------+------------+
> 
> 
> If you fix the issue, kindly add following tag
> Reported-by: kernel test robot <rong.a.chen@xxxxxxxxx>
> 
> 
> 
> user  :notice: [   51.667771] 2019-11-06 23:56:11  ./usemem --runtime 300 -f /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-71 --readonly 22906492245
> 
> user  :notice: [   51.697549] 2019-11-06 23:56:11  ./usemem --runtime 300 -f /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-72 --readonly 22906492245
> 
> kern  :warn  : [   51.715513] usemem invoked oom-killer: gfp_mask=0x400dc0(GFP_KERNEL_ACCOUNT|__GFP_ZERO), order=0, oom_score_adj=0
> 
> user  :notice: [   51.724161] 2019-11-06 23:56:11  truncate /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-73 -s 22906492245
> 
> kern  :warn  : [   51.727992] CPU: 11 PID: 3618 Comm: usemem Not tainted 5.4.0-rc5-00020-g1fc14cf673251 #2
> user  :notice: [   51.744101] 2019-11-06 23:56:11  ./usemem --runtime 300 -f /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-73 --readonly 22906492245
> 
> kern  :warn  : [   51.752655] Call Trace:
> kern  :warn  : [   51.752666]  dump_stack+0x5c/0x7b
> user  :notice: [   51.771480] 2019-11-06 23:56:11  truncate /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-74 -s 22906492245
> 
> kern  :warn  : [   51.775027]  dump_header+0x4a/0x220
> kern  :warn  : [   51.775029]  oom_kill_process+0xe9/0x130
> kern  :warn  : [   51.775031]  out_of_memory+0x105/0x510
> kern  :warn  : [   51.775037]  __alloc_pages_slowpath+0xa3f/0xdb0
> kern  :warn  : [   51.775040]  __alloc_pages_nodemask+0x2f0/0x340
> kern  :warn  : [   51.775044]  pte_alloc_one+0x13/0x40
> kern  :warn  : [   51.775048]  __handle_mm_fault+0xe9d/0xf70
> kern  :warn  : [   51.775050]  handle_mm_fault+0xdd/0x210
> kern  :warn  : [   51.775054]  __do_page_fault+0x2f1/0x520
> kern  :warn  : [   51.775056]  do_page_fault+0x30/0x120
> user  :notice: [   51.782517] 2019-11-06 23:56:11  truncate /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-75 -s 22906492245
> 
> kern  :warn  : [   51.792048]  page_fault+0x3e/0x50
> kern  :warn  : [   51.792051] RIP: 0033:0x55c6ced07cfc
> user  :notice: [   51.798308] 2019-11-06 23:56:11  ./usemem --runtime 300 -f /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-74 --readonly 22906492245
> 
> kern  :warn  : [   51.799413] Code: 00 00 e8 37 f6 ff ff 48 83 c4 08 c3 48 8d 3d 74 23 00 00 e8 56 f6 ff ff bf 01 00 00 00 e8 bc f6 ff ff 85 d2 74 08 48 8d 04 f7 <48> 8b 00 c3 48 8d 04 f7 48 89 30 b8 00 00 00 00 c3 48 89 f8 48 29
> kern  :warn  : [   51.799415] RSP: 002b:00007ffe889ebfe8 EFLAGS: 00010202
> user  :notice: [   51.808045] 2019-11-06 23:56:11  ./usemem --runtime 300 -f /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-75 --readonly 22906492245
> 
> kern  :warn  : [   51.809437] RAX: 00007fd4c5400000 RBX: 00000000085cc600 RCX: 0000000000000018
> kern  :warn  : [   51.809438] RDX: 0000000000000001 RSI: 00000000085cc600 RDI: 00007fd48259d000
> kern  :warn  : [   51.809440] RBP: 00000000085cc600 R08: 000000005dc2ed1f R09: 00007ffe889ebfa0
> user  :notice: [   51.818030] 2019-11-06 23:56:11  truncate /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-76 -s 22906492245
> 
> kern  :warn  : [   51.820780] R10: 00007ffe889ebfa0 R11: 0000000000000246 R12: 0000000042e63000
> kern  :warn  : [   51.820781] R13: 00007fd48259d000 R14: 00007ffe889ec08c R15: 0000000000000001
> kern  :warn  : [   51.820813] Mem-Info:
> user  :notice: [   51.829016] 2019-11-06 23:56:11  ./usemem --runtime 300 -f /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-76 --readonly 22906492245
> 
> kern  :warn  : [   51.830751] active_anon:68712 inactive_anon:29360 isolated_anon:0
>                                active_file:497 inactive_file:48481807 isolated_file:32
>                                unevictable:259869 dirty:2 writeback:0 unstable:0
>                                slab_reclaimable:130937 slab_unreclaimable:70163
>                                mapped:48488420 shmem:30398 pagetables:97884 bounce:0
>                                free:169055 free_pcp:20966 free_cma:0
> user  :notice: [   51.838463] 2019-11-06 23:56:11  truncate /tmp/vm-scalability-tmp/vm-scalability/sparse-lru-file-mmap-read-77 -s 22906492245
> 
> kern  :warn  : [   51.840634] Node 0 active_anon:109476kB inactive_anon:1400kB active_file:76kB inactive_file:47988152kB unevictable:281836kB isolated(anon):0kB isolated(file):0kB mapped:47993516kB dirty:4kB writeback:0kB shmem:1512kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
> 
> 
> To reproduce:
> 
>         git clone https://github.com/intel/lkp-tests.git
>         cd lkp-tests
>         bin/lkp install job.yaml  # job file is attached in this email
>         bin/lkp run     job.yaml
> 

---8<---
Subject: [RFC v1] memcg: make memcg lru reclaim dirty pages
From: Hillf Danton <hdanton@xxxxxxxx>

The memcg lru was added on top of the high work, with the goal of
bypassing soft limit reclaim by hooking into kswapd's logic.

Because the memcg high work is currently unable to reclaim dirty
pages, memcg lru adds the risk of premature OOM even for order-0
allocations, so being able to handle dirty pages is a must-have.

To add that capability, memcg lru no longer takes the high work
route but is embedded in kswapd's page reclaim logic: it provides
the reclaimer with the victim memcg, and kswapd then takes care of
the rest.

The hook function mem_cgroup_reclaim_high() is split into two parts,
mem_cgroup_reclaim_high_begin() and mem_cgroup_reclaim_high_end(),
for better round robin while keeping an eye on over-reclaim.

Thanks to Rong Chen for testing.

Changes since v0
- fix build error
- split hook function into two parts

Reported-by: kernel test robot <rong.a.chen@xxxxxxxxx>
Reported-by: kbuild test robot <lkp@xxxxxxxxx>
Signed-off-by: Hillf Danton <hdanton@xxxxxxxx>
---

--- b/include/linux/memcontrol.h
+++ d/include/linux/memcontrol.h
@@ -742,7 +742,8 @@ static inline void mod_lruvec_page_state
 	local_irq_restore(flags);
 }
 
-void mem_cgroup_reclaim_high(void);
+struct mem_cgroup *mem_cgroup_reclaim_high_begin(void);
+void mem_cgroup_reclaim_high_end(struct mem_cgroup *memcg);
 
 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
@@ -1130,7 +1131,11 @@ static inline void __mod_lruvec_slab_sta
 	__mod_node_page_state(page_pgdat(page), idx, val);
 }
 
-static inline void mem_cgroup_reclaim_high(void)
+static inline struct mem_cgroup *mem_cgroup_reclaim_high_begin(void)
+{
+	return NULL;
+}
+static inline void mem_cgroup_reclaim_high_end(struct mem_cgroup *memcg)
 {
 }
 
--- b/mm/memcontrol.c
+++ d/mm/memcontrol.c
@@ -2362,12 +2362,34 @@ static struct mem_cgroup *memcg_pinch_lr
 	return NULL;
 }
 
-void mem_cgroup_reclaim_high(void)
+struct mem_cgroup *mem_cgroup_reclaim_high_begin(void)
 {
-	struct mem_cgroup *memcg = memcg_pinch_lru();
+	struct mem_cgroup *memcg, *victim;
 
-	if (memcg)
-		schedule_work(&memcg->high_work);
+	memcg = victim = memcg_pinch_lru();
+	if (!memcg)
+		return NULL;
+
+	while ((memcg = parent_mem_cgroup(memcg)))
+		if (page_counter_read(&memcg->memory) > memcg->high) {
+			memcg_memory_event(memcg, MEMCG_HIGH);
+			memcg_add_lru(memcg);
+			break;
+		}
+
+	return victim;
+}
+
+void mem_cgroup_reclaim_high_end(struct mem_cgroup *memcg)
+{
+	while (memcg) {
+		if (page_counter_read(&memcg->memory) > memcg->high) {
+			memcg_memory_event(memcg, MEMCG_HIGH);
+			memcg_add_lru(memcg);
+			return;
+		}
+		memcg = parent_mem_cgroup(memcg);
+	}
 }
 
 static void reclaim_high(struct mem_cgroup *memcg,
--- b/mm/vmscan.c
+++ d/mm/vmscan.c
@@ -2932,6 +2932,29 @@ static inline bool compaction_ready(stru
 	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
 }
 
+#ifdef CONFIG_MEMCG
+static void mem_cgroup_reclaim_high(struct pglist_data *pgdat,
+					struct scan_control *sc)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = mem_cgroup_reclaim_high_begin();
+	if (memcg) {
+		unsigned long ntr = sc->nr_to_reclaim;
+
+		sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
+		shrink_node_memcg(pgdat, memcg, sc);
+		sc->nr_to_reclaim = ntr;
+	}
+	mem_cgroup_reclaim_high_end(memcg);
+}
+#else
+static void mem_cgroup_reclaim_high(struct pglist_data *pgdat,
+					struct scan_control *sc)
+{
+}
+#endif
+
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2996,8 +3019,8 @@ static void shrink_zones(struct zonelist
 			if (zone->zone_pgdat == last_pgdat)
 				continue;
 
-			mem_cgroup_reclaim_high();
-				continue;
+			mem_cgroup_reclaim_high(zone->zone_pgdat, sc);
+			continue;
 
 			/*
 			 * This steals pages from memory cgroups over softlimit
@@ -3693,7 +3716,7 @@ restart:
 		if (sc.priority < DEF_PRIORITY - 2)
 			sc.may_writepage = 1;
 
-		mem_cgroup_reclaim_high();
+		mem_cgroup_reclaim_high(pgdat, &sc);
 		goto soft_limit_reclaim_end;
 
 		/* Call soft limit reclaim before calling shrink_node. */
--