Under heavy memory pressure, the page allocator may call congestion_wait()
to wait for IO congestion to clear or for a timeout to expire. This is not
as sensible a choice as it first appears: there is no guarantee that
BLK_RW_ASYNC is even congested, as the pressure could have been caused by a
large number of SYNC reads, in which case the allocator waits out the
entire timeout, possibly uselessly.

At the point of congestion_wait(), the allocator is struggling to get the
pages it needs and should back off. This patch instead puts the allocator
to sleep on a zone->pressure_wq until either a timeout expires or a direct
reclaimer or kswapd brings the zone over the low watermark, whichever
happens first.

Signed-off-by: Mel Gorman <mel@xxxxxxxxx>
---
 include/linux/mmzone.h |    3 ++
 mm/internal.h          |    4 +++
 mm/mmzone.c            |   47 ++++++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c        |   50 +++++++++++++++++++++++++++++++++++++++-----
 mm/vmscan.c            |    2 +
 5 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 30fe668..72465c1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -398,6 +398,9 @@ struct zone {
 	unsigned long		wait_table_hash_nr_entries;
 	unsigned long		wait_table_bits;
 
+	/* queue for processes waiting for zone pressure to be relieved */
+	wait_queue_head_t	*pressure_wq;
+
 	/*
 	 * Discontig memory support fields.
 	 */
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb..caa5bc8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -251,6 +251,10 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 #define ZONE_RECLAIM_SUCCESS	1
 #endif
 
+extern void check_zone_pressure(struct zone *zone);
+extern long zonepressure_wait(struct zone *zone, unsigned int order,
+			long timeout);
+
 extern int hwpoison_filter(struct page *p);
 
 extern u32 hwpoison_filter_dev_major;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d17..e80b89f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/module.h>
+#include <linux/sched.h>
 
 struct pglist_data *first_online_pgdat(void)
 {
@@ -87,3 +88,49 @@ int memmap_valid_within(unsigned long pfn,
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+void check_zone_pressure(struct zone *zone)
+{
+	/* If no process is waiting, nothing to do */
+	if (!waitqueue_active(zone->pressure_wq))
+		return;
+
+	/* Check if the low watermark is ok for order 0 */
+	if (zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0))
+		wake_up_interruptible(zone->pressure_wq);
+}
+
+/**
+ * zonepressure_wait - Wait for pressure on a zone to ease off
+ * @zone: The zone that is expected to be under pressure
+ * @order: The order the caller is waiting on pages for
+ * @timeout: Wait until pressure is relieved or this timeout is reached
+ *
+ * Waits for up to @timeout jiffies for pressure on a zone to be relieved.
+ * It's considered to be relieved if any direct reclaimer or kswapd brings
+ * the zone above the low watermark.
+ */
+long zonepressure_wait(struct zone *zone, unsigned int order, long timeout)
+{
+	long ret;
+	DEFINE_WAIT(wait);
+
+wait_again:
+	prepare_to_wait(zone->pressure_wq, &wait, TASK_INTERRUPTIBLE);
+
+	/*
+	 * The use of io_schedule_timeout() here means that it gets
+	 * accounted for as IO waiting. This may or may not be the case
+	 * but at least this way it gets picked up by vmstat.
+	 */
+	ret = io_schedule_timeout(timeout);
+	finish_wait(zone->pressure_wq, &wait);
+
+	/* If woken early, check watermarks before continuing */
+	if (ret && !zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) {
+		timeout = ret;
+		goto wait_again;
+	}
+
+	return ret;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8deb9d0..1383ff9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1734,8 +1734,10 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
 			preferred_zone, migratetype);
 
-		if (!page && gfp_mask & __GFP_NOFAIL)
-			congestion_wait(BLK_RW_ASYNC, HZ/50);
+		if (!page && gfp_mask & __GFP_NOFAIL) {
+			/* If still failing, wait for pressure on the zone to be relieved */
+			zonepressure_wait(preferred_zone, order, HZ/50);
+		}
 	} while (!page && (gfp_mask & __GFP_NOFAIL));
 
 	return page;
@@ -1905,8 +1907,8 @@ rebalance:
 	/* Check if we should retry the allocation */
 	pages_reclaimed += did_some_progress;
 	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
-		/* Wait for some write requests to complete then retry */
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		/* Too much pressure, back off a bit and let reclaimers do work */
+		zonepressure_wait(preferred_zone, order, HZ/50);
 		goto rebalance;
 	}
 
@@ -3254,6 +3256,38 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 	return 0;
 }
 
+static noinline __init_refok
+void zone_pressure_wq_cleanup(struct zone *zone)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	size_t free_size = sizeof(wait_queue_head_t);
+
+	if (!slab_is_available())
+		free_bootmem_node(pgdat, __pa(zone->pressure_wq), free_size);
+	else
+		kfree(zone->pressure_wq);
+}
+
+static noinline __init_refok
+int zone_pressure_wq_init(struct zone *zone)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	size_t alloc_size = sizeof(wait_queue_head_t);
+
+	if (!slab_is_available())
+		zone->pressure_wq = (wait_queue_head_t *)
+			alloc_bootmem_node(pgdat, alloc_size);
+	else
+		zone->pressure_wq = kmalloc(alloc_size, GFP_KERNEL);
+
+	if (!zone->pressure_wq)
+		return -ENOMEM;
+
+	init_waitqueue_head(zone->pressure_wq);
+
+	return 0;
+}
+
 static int __zone_pcp_update(void *data)
 {
 	struct zone *zone = data;
@@ -3306,9 +3340,15 @@ __meminit int init_currently_empty_zone(struct zone *zone,
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int ret;
-	ret = zone_wait_table_init(zone, size);
+
+	ret = zone_pressure_wq_init(zone);
 	if (ret)
 		return ret;
+	ret = zone_wait_table_init(zone, size);
+	if (ret) {
+		zone_pressure_wq_cleanup(zone);
+		return ret;
+	}
 	pgdat->nr_zones = zone_idx(zone) + 1;
 
 	zone->zone_start_pfn = zone_start_pfn;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c26986c..4f92a48 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1709,6 +1709,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 		}
 
 		shrink_zone(priority, zone, sc);
+		check_zone_pressure(zone);
 	}
 }
 
@@ -2082,6 +2083,7 @@ loop_again:
 			if (!zone_watermark_ok(zone, order, 8*high_wmark_pages(zone),
 					       end_zone, 0))
 				shrink_zone(priority, zone, &sc);
+			check_zone_pressure(zone);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
-- 
1.6.5