On Wed, Dec 8, 2010 at 7:16 AM, Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
> Kswapd tries to rebalance zones persistently until their high
> watermarks are restored.
>
> If the amount of unreclaimable pages in a zone makes this impossible
> for reclaim, though, kswapd will end up in a busy loop without a
> chance of reaching its goal.
>
> This behaviour was observed on a virtual machine with a tiny
> Normal-zone that filled up with unreclaimable slab objects.
>
> This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> leaves them to direct reclaim.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
>  include/linux/mmzone.h |    2 ++
>  mm/page_alloc.c        |    4 ++--
>  mm/vmscan.c            |   36 ++++++++++++++++++++++++++++--------
>  3 files changed, 32 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 4890662..0cc1d63 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -655,6 +655,8 @@ typedef struct pglist_data {
>  extern struct mutex zonelists_mutex;
>  void build_all_zonelists(void *data);
>  void wakeup_kswapd(struct zone *zone, int order);
> +bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> +		int classzone_idx, int alloc_flags, long free_pages);
>  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
>  		int classzone_idx, int alloc_flags);
>  bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 1845a97..c7d2b28 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1458,8 +1458,8 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
>   * Return true if free pages are above 'mark'. This takes into account the order
>   * of the allocation.
>   */
> -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> -		      int classzone_idx, int alloc_flags, long free_pages)
> +bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> +		      int classzone_idx, int alloc_flags, long free_pages)
>  {
>  	/* free_pages my go negative - that's OK */
>  	long min = mark;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 42a4859..5623f36 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2191,6 +2191,25 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
>  }
>  #endif
>
> +static bool zone_needs_scan(struct zone *zone, int order,
> +			    unsigned long goal, int classzone_idx)
> +{
> +	unsigned long free, prospect;
> +
> +	free = zone_page_state(zone, NR_FREE_PAGES);
> +	if (zone->percpu_drift_mark && free < zone->percpu_drift_mark)
> +		free = zone_page_state_snapshot(zone, NR_FREE_PAGES);
> +
> +	if (__zone_watermark_ok(zone, order, goal, classzone_idx, 0, free))
> +		return false;
> +	/*
> +	 * Ensure that the watermark is at all restorable through
> +	 * reclaim. Otherwise, leave the zone to direct reclaim.
> +	 */
> +	prospect = free + zone_reclaimable_pages(zone);
> +	return prospect >= goal;
> +}
> +
>  /* is kswapd sleeping prematurely? */
>  static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
>  {
> @@ -2210,8 +2229,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
>  		if (zone->all_unreclaimable)
>  			continue;
>
> -		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
> -					    0, 0))
> +		if (zone_needs_scan(zone, order, high_wmark_pages(zone), 0))
>  			return 1;
>  	}
>
> @@ -2282,6 +2300,7 @@ loop_again:
>  		 */
>  		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
>  			struct zone *zone = pgdat->node_zones + i;
> +			unsigned long goal;
>
>  			if (!populated_zone(zone))
>  				continue;
> @@ -2297,8 +2316,8 @@ loop_again:
>  				shrink_active_list(SWAP_CLUSTER_MAX, zone,
>  							&sc, priority, 0);
>
> -			if (!zone_watermark_ok_safe(zone, order,
> -					high_wmark_pages(zone), 0, 0)) {
> +			goal = high_wmark_pages(zone);
> +			if (zone_needs_scan(zone, order, goal, 0)) {
>  				end_zone = i;
>  				break;
>  			}
> @@ -2323,6 +2342,7 @@ loop_again:
>  		 */
>  		for (i = 0; i <= end_zone; i++) {
>  			struct zone *zone = pgdat->node_zones + i;
> +			unsigned long goal;
>  			int nr_slab;
>
>  			if (!populated_zone(zone))
> @@ -2339,12 +2359,13 @@ loop_again:
>  			 */
>  			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
>
> +			goal = high_wmark_pages(zone);
>  			/*
>  			 * We put equal pressure on every zone, unless one
>  			 * zone has way too many pages free already.
>  			 */
>  			if (!zone_watermark_ok_safe(zone, order,
> -					8*high_wmark_pages(zone), end_zone, 0))
> +					8 * goal, end_zone, 0))
>  				shrink_zone(priority, zone, &sc);
>  			reclaim_state->reclaimed_slab = 0;
>  			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
> @@ -2373,8 +2394,7 @@ loop_again:
>  				compact_zone_order(zone, sc.order, sc.gfp_mask,
>  							false);
>
> -			if (!zone_watermark_ok_safe(zone, order,
> -					high_wmark_pages(zone), end_zone, 0)) {
> +			if (zone_needs_scan(zone, order, goal, end_zone)) {
>  				all_zones_ok = 0;
>  				/*
>  				 * We are still under min water mark. This
> @@ -2587,7 +2607,7 @@ void wakeup_kswapd(struct zone *zone, int order)
>  		pgdat->kswapd_max_order = order;
>  	if (!waitqueue_active(&pgdat->kswapd_wait))
>  		return;
> -	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
> +	if (!zone_needs_scan(zone, order, low_wmark_pages(zone), 0))
>  		return;
>
>  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);

So we look at zone_reclaimable_pages() only to determine whether to
proceed with reclaiming or not. What if I have tons of unused dentry
and inode caches and we are skipping the shrinker here?

--Ying

> 1.7.3.2
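The shrinker question comes down to what zone_reclaimable_pages()
actually counts: it only sums the LRU page counters, not reclaimable
slab. A rough paraphrase of the mm/vmscan.c helper (not a verbatim
quote of this tree) looks like:

static unsigned long zone_reclaimable_pages(struct zone *zone)
{
	/* file-backed LRU pages are always considered reclaimable */
	unsigned long nr = zone_page_state(zone, NR_ACTIVE_FILE) +
			   zone_page_state(zone, NR_INACTIVE_FILE);

	/* anon LRU pages only count if there is swap to push them to */
	if (nr_swap_pages > 0)
		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
		      zone_page_state(zone, NR_INACTIVE_ANON);

	/* reclaimable slab (dentries, inodes, ...) is not included */
	return nr;
}

So a zone that is short on free pages but full of unused dentries and
inodes can have free + zone_reclaimable_pages() below the high
watermark even though shrink_slab() could restore it; zone_needs_scan()
then returns false and kswapd hands the zone to direct reclaim without
ever reaching the shrinker for it.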