Re: [patch] mm: skip rebalance of hopeless zones

On Wed, Dec 8, 2010 at 7:16 AM, Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
> Kswapd tries to rebalance zones persistently until their high
> watermarks are restored.
>
> If the amount of unreclaimable pages in a zone makes this impossible
> for reclaim, though, kswapd will end up in a busy loop without a
> chance of reaching its goal.
>
> This behaviour was observed on a virtual machine with a tiny
> Normal-zone that filled up with unreclaimable slab objects.
>
> This patch makes kswapd skip rebalancing on such 'hopeless' zones and
> leaves them to direct reclaim.
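
To make the failure mode concrete, a sketch with invented numbers
(not taken from the report above):

	/* Tiny Normal zone, mostly pinned by unreclaimable slab: */
	unsigned long goal        = 128;  /* high_wmark_pages(zone)       */
	unsigned long free        = 40;   /* NR_FREE_PAGES                */
	unsigned long reclaimable = 30;   /* zone_reclaimable_pages(zone) */

	/*
	 * Even if reclaim freed every reclaimable page, the zone tops
	 * out at 40 + 30 = 70 free pages, short of the 128-page goal.
	 * The high watermark is unreachable, so without the skip kswapd
	 * rescans the zone in a busy loop.
	 */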
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
>  include/linux/mmzone.h |    2 ++
>  mm/page_alloc.c        |    4 ++--
>  mm/vmscan.c            |   36 ++++++++++++++++++++++++++++--------
>  3 files changed, 32 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 4890662..0cc1d63 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -655,6 +655,8 @@ typedef struct pglist_data {
>  extern struct mutex zonelists_mutex;
>  void build_all_zonelists(void *data);
>  void wakeup_kswapd(struct zone *zone, int order);
> +bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> +                        int classzone_idx, int alloc_flags, long free_pages);
>  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
>                int classzone_idx, int alloc_flags);
>  bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 1845a97..c7d2b28 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1458,8 +1458,8 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
>  * Return true if free pages are above 'mark'. This takes into account the order
>  * of the allocation.
>  */
> -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> -                     int classzone_idx, int alloc_flags, long free_pages)
> +bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> +                        int classzone_idx, int alloc_flags, long free_pages)
>  {
>        /* free_pages may go negative - that's OK */
>        long min = mark;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 42a4859..5623f36 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2191,6 +2191,25 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
>  }
>  #endif
>
> +static bool zone_needs_scan(struct zone *zone, int order,
> +                           unsigned long goal, int classzone_idx)
> +{
> +       unsigned long free, prospect;
> +
> +       free = zone_page_state(zone, NR_FREE_PAGES);
> +       if (zone->percpu_drift_mark && free < zone->percpu_drift_mark)
> +               free = zone_page_state_snapshot(zone, NR_FREE_PAGES);
> +
> +       if (__zone_watermark_ok(zone, order, goal, classzone_idx, 0, free))
> +               return false;
> +       /*
> +        * Ensure that the watermark is at all restorable through
> +        * reclaim.  Otherwise, leave the zone to direct reclaim.
> +        */
> +       prospect = free + zone_reclaimable_pages(zone);
> +       return prospect >= goal;
> +}
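
For reference, the new predicate folds two distinct "don't scan"
cases into one return value (an illustrative summary, not part of
the patch):

	/*
	 * zone_needs_scan() outcomes:
	 *
	 *   watermark already met              -> false (nothing to do)
	 *   watermark unmet, prospect >= goal  -> true  (kswapd can help)
	 *   watermark unmet, prospect <  goal  -> false (hopeless; left
	 *                                                to direct reclaim)
	 */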
> +
>  /* is kswapd sleeping prematurely? */
>  static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
>  {
> @@ -2210,8 +2229,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
>                if (zone->all_unreclaimable)
>                        continue;
>
> -               if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
> -                                                               0, 0))
> +               if (zone_needs_scan(zone, order, high_wmark_pages(zone), 0))
>                        return 1;
>        }
>
> @@ -2282,6 +2300,7 @@ loop_again:
>                 */
>                for (i = pgdat->nr_zones - 1; i >= 0; i--) {
>                        struct zone *zone = pgdat->node_zones + i;
> +                       unsigned long goal;
>
>                        if (!populated_zone(zone))
>                                continue;
> @@ -2297,8 +2316,8 @@ loop_again:
>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
>                                                        &sc, priority, 0);
>
> -                       if (!zone_watermark_ok_safe(zone, order,
> -                                       high_wmark_pages(zone), 0, 0)) {
> +                       goal = high_wmark_pages(zone);
> +                       if (zone_needs_scan(zone, order, goal, 0)) {
>                                end_zone = i;
>                                break;
>                        }
> @@ -2323,6 +2342,7 @@ loop_again:
>                 */
>                for (i = 0; i <= end_zone; i++) {
>                        struct zone *zone = pgdat->node_zones + i;
> +                       unsigned long goal;
>                        int nr_slab;
>
>                        if (!populated_zone(zone))
> @@ -2339,12 +2359,13 @@ loop_again:
>                         */
>                        mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
>
> +                       goal = high_wmark_pages(zone);
>                        /*
>                         * We put equal pressure on every zone, unless one
>                         * zone has way too many pages free already.
>                         */
>                        if (!zone_watermark_ok_safe(zone, order,
> -                                       8*high_wmark_pages(zone), end_zone, 0))
> +                                                   8 * goal, end_zone, 0))
>                                shrink_zone(priority, zone, &sc);
>                        reclaim_state->reclaimed_slab = 0;
>                        nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
> @@ -2373,8 +2394,7 @@ loop_again:
>                                compact_zone_order(zone, sc.order, sc.gfp_mask,
>                                                        false);
>
> -                       if (!zone_watermark_ok_safe(zone, order,
> -                                       high_wmark_pages(zone), end_zone, 0)) {
> +                       if (zone_needs_scan(zone, order, goal, end_zone)) {
>                                all_zones_ok = 0;
>                                /*
>                                 * We are still under min water mark.  This
> @@ -2587,7 +2607,7 @@ void wakeup_kswapd(struct zone *zone, int order)
>                pgdat->kswapd_max_order = order;
>        if (!waitqueue_active(&pgdat->kswapd_wait))
>                return;
> -       if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
> +       if (!zone_needs_scan(zone, order, low_wmark_pages(zone), 0))
>                return;
>
>        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);


So we look only at zone_reclaimable_pages() to decide whether to
proceed with reclaim. What if I have tons of unused dentry and inode
caches and we end up skipping the shrinker here?
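
To illustrate the concern with invented numbers (hypothetical, not
measured anywhere):

	/* Zone whose memory sits mostly in droppable slab caches: */
	unsigned long goal       = 256;  /* high_wmark_pages(zone)      */
	unsigned long free       = 100;  /* NR_FREE_PAGES               */
	unsigned long on_lru     = 50;   /* zone_reclaimable_pages()
	                                    counts LRU pages only       */
	unsigned long slab_pages = 500;  /* unused dentry/inode slab,
	                                    invisible to the check      */

	/*
	 * zone_needs_scan() sees 100 + 50 = 150 < 256 and declares the
	 * zone hopeless, yet shrink_slab() could drop those caches and
	 * restore the watermark.
	 */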

--Ying


> 1.7.3.2
