The patch titled
     Subject: mm, page_alloc: wakeup kcompactd even if kswapd cannot free more memory
has been added to the -mm tree.  Its filename is
     mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory.patch

This patch should soon appear at
    http://ozlabs.org/~akpm/mmots/broken-out/mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory.patch
and later at
    http://ozlabs.org/~akpm/mmotm/broken-out/mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory.patch

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when
    testing your code ***

The -mm tree is included into linux-next and is updated
there every 3-4 working days

------------------------------------------------------
From: David Rientjes <rientjes@xxxxxxxxxx>
Subject: mm, page_alloc: wakeup kcompactd even if kswapd cannot free more memory

Kswapd will not wake up if per-zone watermarks are not failing, or if too
many previous attempts at background reclaim have failed.

This can be true if there is a lot of free memory available.  For
high-order allocations, kswapd is responsible for waking up kcompactd for
background compaction.  If the zone is not below its watermarks, or if
reclaim has recently failed (lots of free memory, nothing left to
reclaim), kcompactd does not get woken up.

When __GFP_DIRECT_RECLAIM is not allowed, allow kcompactd to still be
woken up even if kswapd will not reclaim.  This allows high-order
allocations, such as thp, to still trigger background compaction even
when the zone has an abundance of free memory.
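For reviewers who want to see the new decision flow in isolation before
reading the diff, here is a small user-space C model of it.  This is an
illustration only, not kernel code: struct node_state, the wake_* stubs,
and the GFP_DIRECT_RECLAIM_BIT value are invented stand-ins, while the
branch structure and the MAX_RECLAIM_RETRIES limit mirror the patched
wakeup_kswapd().

	/*
	 * Stand-alone model of the wakeup decision added by this patch.
	 * NOT kernel code: the struct, stubs, and flag value below are
	 * illustrative stand-ins; only the branch structure mirrors the
	 * real wakeup_kswapd() after the patch.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	#define GFP_DIRECT_RECLAIM_BIT	0x1u	/* stand-in for __GFP_DIRECT_RECLAIM */
	#define MAX_RECLAIM_RETRIES	16	/* same limit as mm/internal.h */

	struct node_state {			/* stand-in for pg_data_t */
		int kswapd_failures;
		bool balanced;			/* stand-in for pgdat_balanced() */
	};

	static void wake_kswapd(void)
	{
		puts("wake kswapd (it wakes kcompactd after reclaiming)");
	}

	static void wake_kcompactd(void)
	{
		puts("wake kcompactd only (compaction, no reclaim)");
	}

	static void wakeup_kswapd_model(const struct node_state *node,
					unsigned int gfp_flags)
	{
		if (node->kswapd_failures >= MAX_RECLAIM_RETRIES ||
		    node->balanced) {
			/*
			 * Old behaviour: plain return, so a balanced but
			 * fragmented node never got background compaction.
			 * New behaviour: if the caller cannot direct-reclaim
			 * (e.g. a thp fault), still wake kcompactd so it can
			 * defragment in the background.
			 */
			if (!(gfp_flags & GFP_DIRECT_RECLAIM_BIT))
				wake_kcompactd();
			return;
		}
		wake_kswapd();
	}

	int main(void)
	{
		/* Plenty of free memory, thp-style allocation: no direct reclaim. */
		struct node_state node = { .kswapd_failures = 0, .balanced = true };

		wakeup_kswapd_model(&node, 0);	/* prints the kcompactd path */
		return 0;
	}

Run on the scenario in main(), the model takes the kcompactd-only path,
which is exactly the case the changelog describes: watermarks fine, lots
of free memory, but no high-order pages available.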
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803111659420.209721@xxxxxxxxxxxxxxxxxxxxxxxxx
Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx>
Cc: Vlastimil Babka <vbabka@xxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 Documentation/trace/postprocess/trace-vmscan-postprocess.pl |    4 -
 include/linux/mmzone.h                                      |    3 
 include/trace/events/vmscan.h                               |   17 +++--
 mm/page_alloc.c                                             |   14 ++--
 mm/vmscan.c                                                 |   32 +++++++---
 5 files changed, 45 insertions(+), 25 deletions(-)

diff -puN Documentation/trace/postprocess/trace-vmscan-postprocess.pl~mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory Documentation/trace/postprocess/trace-vmscan-postprocess.pl
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl~mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory
+++ a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -111,7 +111,7 @@ my $regex_direct_begin_default = 'order=
 my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
 my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
 my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
-my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
+my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
 my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
 my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
 my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
@@ -201,7 +201,7 @@ $regex_kswapd_sleep = generate_traceeven
 $regex_wakeup_kswapd = generate_traceevent_regex(
			"vmscan/mm_vmscan_wakeup_kswapd",
			"$regex_wakeup_kswapd_default",
-			"nid", "zid", "order");
+			"nid", "zid", "order", "gfp_flags");
 $regex_lru_isolate = generate_traceevent_regex(
			"vmscan/mm_vmscan_lru_isolate",
			$regex_lru_isolate_default,
diff -puN include/linux/mmzone.h~mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory include/linux/mmzone.h
--- a/include/linux/mmzone.h~mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory
+++ a/include/linux/mmzone.h
@@ -777,7 +777,8 @@ static inline bool is_dev_zone(const str
 #include <linux/memory_hotplug.h>

 void build_all_zonelists(pg_data_t *pgdat);
-void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
+		enum zone_type classzone_idx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order,
		unsigned long mark, int classzone_idx,
		unsigned int alloc_flags, long free_pages);
diff -puN include/trace/events/vmscan.h~mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory include/trace/events/vmscan.h
--- a/include/trace/events/vmscan.h~mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory
+++ a/include/trace/events/vmscan.h
@@ -78,26 +78,29 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,

 TRACE_EVENT(mm_vmscan_wakeup_kswapd,

-	TP_PROTO(int nid, int zid, int order),
+	TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),

-	TP_ARGS(nid, zid, order),
+	TP_ARGS(nid, zid, order, gfp_flags),

	TP_STRUCT__entry(
-		__field(	int,	nid	)
-		__field(	int,	zid	)
-		__field(	int,	order	)
+		__field(	int,	nid	)
+		__field(	int,	zid	)
+		__field(	int,	order	)
+		__field(	gfp_t,	gfp_flags	)
	),

	TP_fast_assign(
		__entry->nid = nid;
		__entry->zid = zid;
		__entry->order = order;
+		__entry->gfp_flags = gfp_flags;
	),

-	TP_printk("nid=%d zid=%d order=%d",
+	TP_printk("nid=%d zid=%d order=%d gfp_flags=%s",
		__entry->nid,
		__entry->zid,
-		__entry->order)
+		__entry->order,
+		show_gfp_flags(__entry->gfp_flags))
 );

 DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
diff -puN mm/page_alloc.c~mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory mm/page_alloc.c
--- a/mm/page_alloc.c~mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory
+++ a/mm/page_alloc.c
@@ -3797,16 +3797,18 @@ retry:
	return page;
 }

-static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
+static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
+			     const struct alloc_context *ac)
 {
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *last_pgdat = NULL;
+	enum zone_type high_zoneidx = ac->high_zoneidx;

-	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
-					ac->high_zoneidx, ac->nodemask) {
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
+					ac->nodemask) {
		if (last_pgdat != zone->zone_pgdat)
-			wakeup_kswapd(zone, order, ac->high_zoneidx);
+			wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
		last_pgdat = zone->zone_pgdat;
	}
 }
@@ -4085,7 +4087,7 @@ retry_cpuset:
		goto nopage;

	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
-		wake_all_kswapds(order, ac);
+		wake_all_kswapds(order, gfp_mask, ac);

	/*
	 * The adjusted alloc_flags might result in immediate success, so try
@@ -4143,7 +4145,7 @@ retry_cpuset:
 retry:
	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
-		wake_all_kswapds(order, ac);
+		wake_all_kswapds(order, gfp_mask, ac);

	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
	if (reserve_flags)
diff -puN mm/vmscan.c~mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory mm/vmscan.c
--- a/mm/vmscan.c~mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory
+++ a/mm/vmscan.c
@@ -3538,16 +3538,21 @@ kswapd_try_sleep:
 }

 /*
- * A zone is low on free memory, so wake its kswapd task to service it.
+ * A zone is low on free memory or too fragmented for high-order memory.  If
+ * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
+ * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
+ * has failed or is not needed, still wake up kcompactd if only compaction is
+ * needed.
  */
-void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
+		enum zone_type classzone_idx)
 {
	pg_data_t *pgdat;

	if (!managed_zone(zone))
		return;

-	if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
+	if (!cpuset_zone_allowed(zone, gfp_flags))
		return;
	pgdat = zone->zone_pgdat;
	pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
@@ -3556,14 +3561,23 @@ void wakeup_kswapd(struct zone *zone, in
	if (!waitqueue_active(&pgdat->kswapd_wait))
		return;

-	/* Hopeless node, leave it to direct reclaim */
-	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+	/* Hopeless node, leave it to direct reclaim if possible */
+	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
+	    pgdat_balanced(pgdat, order, classzone_idx)) {
+		/*
+		 * There may be plenty of free memory available, but it's too
+		 * fragmented for high-order allocations.  Wake up kcompactd
+		 * and rely on compaction_suitable() to determine if it's
+		 * needed.  If it fails, it will defer subsequent attempts to
+		 * ratelimit its work.
+		 */
+		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
+			wakeup_kcompactd(pgdat, order, classzone_idx);
		return;
+	}

-	if (pgdat_balanced(pgdat, order, classzone_idx))
-		return;
-
-	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
+	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
+				      gfp_flags);
	wake_up_interruptible(&pgdat->kswapd_wait);
 }

_

Patches currently in -mm which might be from rientjes@xxxxxxxxxx are

mm-page_alloc-extend-kernelcore-and-movablecore-for-percent.patch
mm-page_alloc-extend-kernelcore-and-movablecore-for-percent-fix.patch
mm-page_alloc-move-mirrored_kernelcore-to-__meminitdata.patch
mm-compaction-drain-pcps-for-zone-when-kcompactd-fails.patch
mm-page_alloc-wakeup-kcompactd-even-if-kswapd-cannot-free-more-memory.patch
mm-oom-remove-3%-bonus-for-cap_sys_admin-processes.patch
--
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html