Tetsuo Handa wrote:
> From fb48bec5d08068bc68023f4684098d0ce9ab6439 Mon Sep 17 00:00:00 2001
> From: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
> Date: Thu, 10 Sep 2015 20:13:38 +0900
> Subject: [PATCH] mm/page_alloc: Favor kthread and dying threads over normal
>  threads

The effect of this patch (which gives higher priority to kernel threads
and dying threads) becomes clear if the different reproducer shown below

----------------------------------------
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sched.h>

/* Keep generating dirty page cache by appending to a file until write() fails. */
static int file_writer(void *unused)
{
	static char buffer[4096] = { };
	const int fd = open("/tmp/file", O_WRONLY | O_CREAT | O_APPEND, 0600);
	sleep(2);
	while (write(fd, buffer, sizeof(buffer)) == sizeof(buffer));
	return 0;
}

/* Grow an overcommitted heap as large as possible, then touch all of it. */
static int memory_consumer(void *unused)
{
	const int fd = open("/dev/zero", O_RDONLY);
	unsigned long size;
	char *buf = NULL;
	sleep(3);
	unlink("/tmp/file");
	for (size = 1048576; size < 512UL * (1 << 30); size <<= 1) {
		char *cp = realloc(buf, size);
		if (!cp) {
			size >>= 1;
			break;
		}
		buf = cp;
	}
	read(fd, buf, size); /* Will cause OOM due to overcommit */
	return 0;
}

int main(int argc, char *argv[])
{
	int i;
	/* Two threads dirtying the file, one thread consuming all memory. */
	for (i = 0; i < 2; i++)
		clone(file_writer, malloc(4 * 1024) + 4 * 1024,
		      CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, NULL);
	clone(memory_consumer, malloc(4 * 1024) + 4 * 1024,
	      CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, NULL);
	pause();
	return 0;
}
----------------------------------------

is used with the "GFP_NOFS can fail" patch shown below.

----------------------------------------
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a7a3a63..d21742c4 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -54,8 +54,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
 			return ptr;
 		if (!(++retries % 100))
-			xfs_err(NULL,
+			xfs_err(NULL, "%s(%u) "
 		"possible memory allocation deadlock in %s (mode:0x%x)",
+				current->comm, current->pid,
 					__func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
@@ -119,8 +120,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
 		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
 			return ptr;
 		if (!(++retries % 100))
-			xfs_err(NULL,
+			xfs_err(NULL, "%s(%u) "
 		"possible memory allocation deadlock in %s (mode:0x%x)",
+				current->comm, current->pid,
 					__func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8ecffb3..3ea4188 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -353,8 +353,9 @@ retry:
 			 * handle buffer allocation failures we can't do much.
 			 */
 			if (!(++retries % 100))
-				xfs_err(NULL,
+				xfs_err(NULL, "%s(%u) "
 		"possible memory allocation deadlock in %s (mode:0x%x)",
+					current->comm, current->pid,
 					__func__, gfp_mask);
 
 			XFS_STATS_INC(xb_page_retries);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dcfe935..2c8873b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2680,6 +2680,9 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 {
 	unsigned int filter = SHOW_MEM_FILTER_NODES;
 
+	if (!(gfp_mask & __GFP_FS))
+		return;
+
 	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
 	    debug_guardpage_minorder() > 0)
 		return;
@@ -2764,12 +2767,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 		/* The OOM killer does not compensate for IO-less reclaim */
 		if (!(gfp_mask & __GFP_FS)) {
-			/*
-			 * XXX: Page reclaim didn't yield anything,
-			 * and the OOM killer can't be invoked, but
-			 * keep looping as per tradition.
-			 */
-			*did_some_progress = 1;
 			goto out;
 		}
 		if (pm_suspended_storage())
----------------------------------------

Without this patch, we can observe that the workqueue for writeback
operations gets stuck in memory allocation (indicated by XFS's "possible
memory allocation deadlock" warning). This is exactly the situation the
comment in throttle_direct_reclaim() warns about: kernel threads should
not be throttled, because they may be indirectly responsible for cleaning
pages necessary for reclaim to make forward progress.

----------------------------------------
[ 174.062364] systemd-journal invoked oom-killer: gfp_mask=0x280da, order=0, oom_score_adj=0
[ 174.064543] systemd-journal cpuset=/ mems_allowed=0
[ 174.066339] CPU: 2 PID: 470 Comm: systemd-journal Not tainted 4.2.0-next-20150909+ #110
[ 174.068416] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013
[ 174.070951] 0000000000000000 000000009a0f4f1a ffff880035de3af8 ffffffff8131bd76
[ 174.073060] ffff880035ce9980 ffff880035de3ba0 ffffffff81187d2d ffff880035de3b20
[ 174.075067] ffffffff8108fc93 ffff8800775d4ed0 ffff8800775d4c80 ffff8800775d4c80
[ 174.077365] Call Trace:
[ 174.078381] [<ffffffff8131bd76>] dump_stack+0x4e/0x88
[ 174.079888] [<ffffffff81187d2d>] dump_header+0x82/0x232
[ 174.081422] [<ffffffff8108fc93>] ? preempt_count_add+0x43/0x90
[ 174.084411] [<ffffffff8108fc0d>] ? get_parent_ip+0xd/0x50
[ 174.086105] [<ffffffff8108fc93>] ? preempt_count_add+0x43/0x90
[ 174.087817] [<ffffffff8111b8bb>] oom_kill_process+0x35b/0x3c0
[ 174.089493] [<ffffffff810737d0>] ? has_ns_capability_noaudit+0x30/0x40
[ 174.091212] [<ffffffff810737f2>] ? has_capability_noaudit+0x12/0x20
[ 174.092926] [<ffffffff8111bb8d>] out_of_memory+0x21d/0x4a0
[ 174.094552] [<ffffffff81121184>] __alloc_pages_nodemask+0x904/0x930
[ 174.096426] [<ffffffff811643b0>] alloc_pages_vma+0xb0/0x1f0
[ 174.098244] [<ffffffff81144ed2>] handle_mm_fault+0x13f2/0x19d0
[ 174.100161] [<ffffffff81163397>] ? change_prot_numa+0x17/0x30
[ 174.101943] [<ffffffff81057912>] __do_page_fault+0x152/0x480
[ 174.103483] [<ffffffff81057c70>] do_page_fault+0x30/0x80
[ 174.104982] [<ffffffff816382e8>] page_fault+0x28/0x30
[ 174.106378] Mem-Info:
[ 174.107285] active_anon:314047 inactive_anon:1920 isolated_anon:16
[ 174.107285]  active_file:11066 inactive_file:87440 isolated_file:0
[ 174.107285]  unevictable:0 dirty:5533 writeback:81919 unstable:0
[ 174.107285]  slab_reclaimable:4102 slab_unreclaimable:4889
[ 174.107285]  mapped:10081 shmem:2148 pagetables:1906 bounce:0
[ 174.107285]  free:13078 free_pcp:30 free_cma:0
[ 174.116538] Node 0 DMA free:7312kB min:400kB low:500kB high:600kB active_anon:5204kB inactive_anon:144kB active_file:216kB inactive_file:976kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15988kB managed:15904kB mlocked:0kB dirty:32kB writeback:932kB mapped:296kB shmem:180kB slab_reclaimable:288kB slab_unreclaimable:300kB kernel_stack:240kB pagetables:396kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:7792 all_unreclaimable? yes
[ 174.126674] lowmem_reserve[]: 0 1729 1729 1729
[ 174.129129] Node 0 DMA32 free:45000kB min:44652kB low:55812kB high:66976kB active_anon:1250984kB inactive_anon:7536kB active_file:44048kB inactive_file:348784kB unevictable:0kB isolated(anon):64kB isolated(file):0kB present:2080640kB managed:1774196kB mlocked:0kB dirty:22100kB writeback:326744kB mapped:40028kB shmem:8412kB slab_reclaimable:16120kB slab_unreclaimable:19256kB kernel_stack:3920kB pagetables:7228kB unstable:0kB bounce:0kB free_pcp:120kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
[ 174.141837] lowmem_reserve[]: 0 0 0 0
[ 174.143413] Node 0 DMA: 5*4kB (EM) 2*8kB (UE) 7*16kB (UEM) 3*32kB (UE) 3*64kB (UEM) 2*128kB (E) 2*256kB (UE) 2*512kB (UM) 3*1024kB (UEM) 1*2048kB (U) 0*4096kB = 7348kB
[ 174.148343] Node 0 DMA32: 691*4kB (UE) 650*8kB (UEM) 242*16kB (UE) 30*32kB (UE) 6*64kB (UE) 3*128kB (U) 7*256kB (UEM) 6*512kB (UE) 24*1024kB (UEM) 1*2048kB (E) 0*4096kB = 45052kB
[ 174.154113] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[ 174.156382] 100695 total pagecache pages
[ 174.157986] 0 pages in swap cache
[ 174.159431] Swap cache stats: add 0, delete 0, find 0/0
[ 174.161316] Free swap  = 0kB
[ 174.162748] Total swap = 0kB
[ 174.164874] 524157 pages RAM
[ 174.166635] 0 pages HighMem/MovableOnly
[ 174.168878] 76632 pages reserved
[ 174.170472] 0 pages hwpoisoned
[ 174.171788] [ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name
[ 174.174162] [  470]     0   470    34593     2894      31       3        0             0 systemd-journal
[ 174.176546] [  485]     0   485    10290      810      23       3        0         -1000 systemd-udevd
[ 174.179056] [  507]     0   507    12795      763      25       3        0         -1000 auditd
[ 174.181386] [ 1688]     0  1688    82430     6883      83       3        0             0 firewalld
[ 174.184146] [ 1691]    70  1691     6988      671      18       3        0             0 avahi-daemon
[ 174.186614] [ 1694]     0  1694    54104     1701      40       3        0             0 rsyslogd
[ 174.189930] [ 1695]     0  1695   137547     5615      88       3        0             0 tuned
[ 174.192275] [ 1698]     0  1698     4823      678      15       3        0             0 irqbalance
[ 174.194670] [ 1699]     0  1699     1095      358       8       3        0             0 rngd
[ 174.196894] [ 1705]     0  1705    53609     2135      59       3        0             0 abrtd
[ 174.199280] [ 1706]     0  1706    53001     1962      57       4        0             0 abrt-watch-log
[ 174.202202] [ 1708]     0  1708     8673      726      23       3        0             0 systemd-logind
[ 174.205167] [ 1709]    81  1709     6647      734      18       3        0          -900 dbus-daemon
[ 174.207828] [ 1717]     0  1717    31578      802      20       3        0             0 crond
[ 174.210248] [ 1756]    70  1756     6988       57      17       3        0             0 avahi-daemon
[ 174.212817] [ 1900]     0  1900    46741     1920      43       3        0             0 vmtoolsd
[ 174.215156] [ 2445]     0  2445    25938     3354      49       3        0             0 dhclient
[ 174.217955] [ 2449]   999  2449   128626     3447      49       4        0             0 polkitd
[ 174.220319] [ 2532]     0  2532    20626     1512      42       4        0         -1000 sshd
[ 174.222694] [ 2661]     0  2661     7320      596      19       3        0             0 xinetd
[ 174.224974] [ 4080]     0  4080    22770     1182      43       3        0             0 master
[ 174.227266] [ 4247]    89  4247    22796     1533      46       3        0             0 pickup
[ 174.229483] [ 4248]    89  4248    22813     1605      45       3        0             0 qmgr
[ 174.231719] [ 4772]     0  4772    75242     1276      96       3        0             0 nmbd
[ 174.234313] [ 4930]     0  4930    92960     3416     130       3        0             0 smbd
[ 174.236671] [ 4967]     0  4967    92960     1516     125       3        0             0 smbd
[ 174.239945] [ 5046]     0  5046    27503      571      12       3        0             0 agetty
[ 174.242850] [11027]     0 11027    21787     1047      48       3        0             0 login
[ 174.246033] [11030]  1000 11030    28865      904      14       3        0             0 bash
[ 174.248385] [11108]  1000 11107   541750   295927     588       6        0             0 a.out
[ 174.250806] Out of memory: Kill process 11109 (a.out) score 662 or sacrifice child
[ 174.252879] Killed process 11108 (a.out) total-vm:2167000kB, anon-rss:1182716kB, file-rss:992kB
[ 178.675269] XFS: crond(1717) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
[ 178.729646] XFS: kworker/u16:29(382) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
[ 180.805219] XFS: crond(1717) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
[ 180.877987] XFS: kworker/u16:29(382) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
[ 182.392209] XFS: vmtoolsd(1900) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
[ 182.961922] XFS: crond(1717) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
[ 183.050782] XFS: kworker/u16:29(382) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
(...snipped...)
[ 255.566378] kworker/u16:29  D ffff88007fc55b40     0   382      2 0x00000000
[ 255.568322] Workqueue: writeback wb_workfn (flush-8:0)
[ 255.569891] ffff880077666fc8 0000000000000046 ffff880077646600 ffff880077668000
[ 255.572002] ffff880077667030 ffff88007fc4dfc0 00000000ffff501b 0000000000000040
[ 255.574117] ffff880077667010 ffffffff81631bf8 ffff88007fc4dfc0 ffff880077667090
[ 255.576197] Call Trace:
[ 255.577225] [<ffffffff81631bf8>] schedule+0x38/0x90
[ 255.578825] [<ffffffff81635672>] schedule_timeout+0x122/0x1c0
[ 255.580629] [<ffffffff810c8020>] ? cascade+0x90/0x90
[ 255.582192] [<ffffffff81635769>] schedule_timeout_uninterruptible+0x19/0x20
[ 255.584132] [<ffffffff81120eb8>] __alloc_pages_nodemask+0x638/0x930
[ 255.585816] [<ffffffff8116310c>] alloc_pages_current+0x8c/0x100
[ 255.587608] [<ffffffff8127bf7a>] xfs_buf_allocate_memory+0x17b/0x26e
[ 255.589529] [<ffffffff81246bca>] xfs_buf_get_map+0xca/0x130
[ 255.591139] [<ffffffff81247144>] xfs_buf_read_map+0x24/0xb0
[ 255.592828] [<ffffffff8126ec77>] xfs_trans_read_buf_map+0x97/0x1a0
[ 255.594633] [<ffffffff812223d3>] xfs_btree_read_buf_block.constprop.28+0x73/0xc0
[ 255.596745] [<ffffffff8122249b>] xfs_btree_lookup_get_block+0x7b/0xf0
[ 255.598527] [<ffffffff812223e9>] ? xfs_btree_read_buf_block.constprop.28+0x89/0xc0
[ 255.600567] [<ffffffff8122638e>] xfs_btree_lookup+0xbe/0x4a0
[ 255.602289] [<ffffffff8120d546>] xfs_alloc_lookup_eq+0x16/0x20
[ 255.604092] [<ffffffff8120da7d>] xfs_alloc_fixup_trees+0x23d/0x340
[ 255.605915] [<ffffffff812110cc>] ? xfs_allocbt_init_cursor+0x3c/0xc0
[ 255.607577] [<ffffffff8120f381>] xfs_alloc_ag_vextent_near+0x511/0x880
[ 255.609336] [<ffffffff8120fdb5>] xfs_alloc_ag_vextent+0xb5/0xe0
[ 255.611082] [<ffffffff81210866>] xfs_alloc_vextent+0x356/0x460
[ 255.613046] [<ffffffff8121e496>] xfs_bmap_btalloc+0x386/0x6d0
[ 255.614684] [<ffffffff8121e7e9>] xfs_bmap_alloc+0x9/0x10
[ 255.616322] [<ffffffff8121f1e9>] xfs_bmapi_write+0x4b9/0xa10
[ 255.617969] [<ffffffff8125280c>] xfs_iomap_write_allocate+0x13c/0x320
[ 255.619818] [<ffffffff812407ba>] xfs_map_blocks+0x15a/0x170
[ 255.621500] [<ffffffff8124177b>] xfs_vm_writepage+0x18b/0x5b0
[ 255.623066] [<ffffffff811228ce>] __writepage+0xe/0x30
[ 255.624593] [<ffffffff811232f3>] write_cache_pages+0x1f3/0x4a0
[ 255.626287] [<ffffffff811228c0>] ? mapping_tagged+0x10/0x10
[ 255.628265] [<ffffffff811235ec>] generic_writepages+0x4c/0x80
[ 255.630194] [<ffffffff8108fc0d>] ? get_parent_ip+0xd/0x50
[ 255.631883] [<ffffffff8108fc93>] ? preempt_count_add+0x43/0x90
[ 255.633464] [<ffffffff8124062e>] xfs_vm_writepages+0x3e/0x50
[ 255.635000] [<ffffffff81124199>] do_writepages+0x19/0x30
[ 255.636549] [<ffffffff811b3de3>] __writeback_single_inode+0x33/0x170
[ 255.638345] [<ffffffff81635fe5>] ? _raw_spin_unlock+0x15/0x40
[ 255.640005] [<ffffffff811b44a9>] writeback_sb_inodes+0x279/0x440
[ 255.641636] [<ffffffff811b46f1>] __writeback_inodes_wb+0x81/0xb0
[ 255.643310] [<ffffffff811b48cc>] wb_writeback+0x1ac/0x1e0
[ 255.644866] [<ffffffff811b4e45>] wb_workfn+0xe5/0x2f0
[ 255.646384] [<ffffffff8163606c>] ? _raw_spin_unlock_irq+0x1c/0x40
[ 255.648301] [<ffffffff8108bda9>] ? finish_task_switch+0x69/0x230
[ 255.649915] [<ffffffff81081a59>] process_one_work+0x129/0x300
[ 255.651479] [<ffffffff81081d45>] worker_thread+0x115/0x450
[ 255.653019] [<ffffffff81081c30>] ? process_one_work+0x300/0x300
[ 255.654664] [<ffffffff81087113>] kthread+0xd3/0xf0
[ 255.656060] [<ffffffff81087040>] ? kthread_create_on_node+0x1a0/0x1a0
[ 255.657736] [<ffffffff81636b1f>] ret_from_fork+0x3f/0x70
[ 255.659240] [<ffffffff81087040>] ? kthread_create_on_node+0x1a0/0x1a0
(...snipped...)
[ 262.539668] Showing busy workqueues and worker pools:
[ 262.540997] workqueue events: flags=0x0
[ 262.542153]   pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=5/256
[ 262.544104]     pending: vmpressure_work_fn, e1000_watchdog [e1000], vmstat_update, vmw_fb_dirty_flush [vmwgfx], console_callback
[ 262.547286] workqueue events_freezable: flags=0x4
[ 262.548604]   pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=1/256
[ 262.550398]     pending: vmballoon_work [vmw_balloon]
[ 262.552006] workqueue events_power_efficient: flags=0x80
[ 262.553542]   pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=1/256
[ 262.555281]     pending: neigh_periodic_work
[ 262.556624] workqueue events_freezable_power_: flags=0x84
[ 262.558168]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256
[ 262.560241]     in-flight: 214:disk_events_workfn
[ 262.561837] workqueue writeback: flags=0x4e
[ 262.563258]   pwq 16: cpus=0-7 flags=0x4 nice=0 active=2/256
[ 262.564916]     in-flight: 382:wb_workfn wb_workfn
[ 262.566530] workqueue xfs-data/sda1: flags=0xc
[ 262.567905]   pwq 6: cpus=3 node=0 flags=0x0 nice=0 active=6/256
[ 262.569664]     in-flight: 11065:xfs_end_io, 11066:xfs_end_io, 11026:xfs_end_io, 11068:xfs_end_io, 11064:xfs_end_io, 82:xfs_end_io
[ 262.572704]   pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=17/256
[ 262.574497]     in-flight: 447:xfs_end_io, 398(RESCUER):xfs_end_io xfs_end_io xfs_end_io xfs_end_io xfs_end_io xfs_end_io xfs_end_io xfs_end_io, 11071:xfs_end_io, 11072:xfs_end_io, 11069:xfs_end_io, 11090:xfs_end_io, 11073:xfs_end_io, 11091:xfs_end_io, 23:xfs_end_io, 11070:xfs_end_io
[ 262.581400] pool 0: cpus=0 node=0 flags=0x0 nice=0 workers=4 idle: 11096 47 4
[ 262.583536] pool 4: cpus=2 node=0 flags=0x0 nice=0 workers=10 manager: 86
[ 262.585596] pool 6: cpus=3 node=0 flags=0x0 nice=0 workers=14 idle: 11063 11062 11061 11060 11059 30 84 11067
[ 262.588545] pool 16: cpus=0-7 flags=0x4 nice=0 workers=32 idle: 380 381 379 378 377 376 375 374 373 372 371 370 369 368 367 366 365 364 363 362 361 360 359 358 277 279 6 271 69 384 383
[ 263.463828] XFS: vmtoolsd(1900) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
[ 264.167134] XFS: crond(1717) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
[ 264.292440] XFS: pickup(4247) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
[ 264.335779] XFS: kworker/u16:29(382) possible memory allocation deadlock in xfs_buf_allocate_memory (mode:0x250)
----------------------------------------

Complete log is at http://I-love.SAKURA.ne.jp/tmp/serial-20150911.txt.xz

With this patch, as far as I tested, I didn't see the warning.
I don't know whether ALLOC_HIGH is the best way to express the priority,
but I think that favoring kernel threads and dying threads does help them
make forward progress.
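Since only the subject line of my patch is quoted above, here is a
minimal sketch of the idea for reference (an illustration only; the hunk
location and details may differ from the actual patch): grant ALLOC_HIGH
to kernel threads and to threads with a fatal signal pending inside
gfp_to_alloc_flags(), so that they can dip into memory reserves ahead of
normal threads.

----------------------------------------
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ static inline int gfp_to_alloc_flags(gfp_t gfp_mask)
 	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
+	/*
+	 * Favor kernel threads and dying threads over normal threads by
+	 * letting them dip into memory reserves.
+	 */
+	if ((current->flags & PF_KTHREAD) || fatal_signal_pending(current))
+		alloc_flags |= ALLOC_HIGH;
+
----------------------------------------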