If swap is backed by network storage such as NBD, there is a risk that a
large number of reclaimers can hang the system by consuming all
PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune
min_free_kbytes in advance. This patch throttles direct reclaimers if
half of the PF_MEMALLOC reserves are already in use, as the system is
then at risk of hanging.

Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
---
 include/linux/mmzone.h |    1 +
 mm/page_alloc.c        |    1 +
 mm/vmscan.c            |   54 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 0 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e56f835..8e5c627 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -638,6 +638,7 @@ typedef struct pglist_data {
 					     range, including holes */
 	int node_id;
 	wait_queue_head_t kswapd_wait;
+	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5ff1f71..eee10ec 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4225,6 +4225,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat->kswapd_max_order = 0;
 	pgdat_page_cgroup_init(pgdat);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b3a569f..7f1099f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2117,6 +2117,45 @@ out:
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat, int high_zoneidx)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+
+	for (i = 0; i <= high_zoneidx; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	return (free_pages > pfmemalloc_reserve / 2) ? true : false;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	DEFINE_WAIT(wait);
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	if (pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx))
+		return;
+
+	/* Throttle */
+	wait_event_interruptible(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
@@ -2133,6 +2172,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.nodemask = nodemask,
 	};
 
+	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+	/*
+	 * Do not enter reclaim if fatal signal is pending. 1 is returned so
+	 * that the page allocator does not consider triggering OOM
+	 */
+	if (fatal_signal_pending(current))
+		return 1;
+
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
@@ -2488,6 +2536,12 @@ loop_again:
 			}
 
 		}
+
+		/* Wake throttled direct reclaimers if low watermark is met */
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat, MAX_NR_ZONES - 1))
+			wake_up_interruptible(&pgdat->pfmemalloc_wait);
+
 		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 			break;		/* kswapd: all done */
 		/*
-- 
1.7.3.4
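
P.S. For anyone unfamiliar with the watermark arithmetic, here is a minimal,
standalone userspace sketch of the check that pfmemalloc_watermark_ok()
performs. The per-zone numbers are made up purely for illustration; throttling
kicks in once free pages no longer exceed half of the node's summed min
watermarks.

	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical per-zone figures, in pages; not taken from a real system */
	struct zone_sample {
		unsigned long min_wmark_pages;
		unsigned long nr_free_pages;
	};

	/* Mirrors the free_pages > pfmemalloc_reserve / 2 test from the patch */
	static bool sample_watermark_ok(const struct zone_sample *zones, int nr_zones)
	{
		unsigned long pfmemalloc_reserve = 0;
		unsigned long free_pages = 0;
		int i;

		for (i = 0; i < nr_zones; i++) {
			pfmemalloc_reserve += zones[i].min_wmark_pages;
			free_pages += zones[i].nr_free_pages;
		}

		return free_pages > pfmemalloc_reserve / 2;
	}

	int main(void)
	{
		/* e.g. a two-zone node: reserve = 5120 pages, free = 2600 pages */
		struct zone_sample zones[] = {
			{ .min_wmark_pages = 1024, .nr_free_pages =  400 },
			{ .min_wmark_pages = 4096, .nr_free_pages = 2200 },
		};

		/* 2600 > 5120 / 2, so direct reclaimers would not be throttled */
		printf("watermark ok: %d\n", sample_watermark_ok(zones, 2));
		return 0;
	}

In this example, once free pages dropped to 2560 or fewer the check would
fail and direct reclaimers would sleep on pfmemalloc_wait until kswapd
makes enough progress to wake them.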