On Thu, Nov 7, 2019 at 12:53 PM Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
>
> We split the LRU lists into inactive and an active parts to maximize
> workingset protection while allowing just enough inactive cache space
> to faciltate readahead and writeback for one-off file accesses (e.g. a
> linear scan through a file, or logging); or just enough inactive anon
> to maintain recent reference information when reclaim needs to swap.
>
> With cgroups and their nested LRU lists, we currently don't do this
> correctly. While recursive cgroup reclaim establishes a relative LRU
> order among the pages of all involved cgroups, inactive:active size
> decisions are done on a per-cgroup level. As a result, we'll reclaim a
> cgroup's workingset when it doesn't have cold pages, even when one of
> its siblings has plenty of it that should be reclaimed first.
>
> For example: workload A has 50M worth of hot cache but doesn't do any
> one-off file accesses; meanwhile, parallel workload B scans files and
> rarely accesses the same page twice.
>
> If these workloads were to run in an uncgrouped system, A would be
> protected from the high rate of cache faults from B. But if they were
> put in parallel cgroups for memory accounting purposes, B's fast cache
> fault rate would push out the hot cache pages of A. This is unexpected
> and undesirable - the "scan resistance" of the page cache is broken.
>
> This patch moves inactive:active size balancing decisions to the root
> of reclaim - the same level where the LRU order is established.
>
> It does this by looking at the recursize

recursive?

> size of the inactive and the
> active file sets of the cgroup subtree at the beginning of the reclaim
> cycle, and then making a decision - scan or skip active pages - that
> applies throughout the entire run and to every cgroup involved.
>
> With that in place, in the test above, the VM will recognize that
> there are plenty of inactive pages in the combined cache set of
> workloads A and B and prefer the one-off cache in B over the hot pages
> in A. The scan resistance of the cache is restored.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
>  include/linux/mmzone.h |   4 +-
>  mm/vmscan.c            | 185 ++++++++++++++++++++++++++---------------
>  2 files changed, 118 insertions(+), 71 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 7a09087e8c77..454a230ad417 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -229,12 +229,12 @@ enum lru_list {
>
>  #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
>
> -static inline int is_file_lru(enum lru_list lru)
> +static inline bool is_file_lru(enum lru_list lru)
>  {
>         return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
>  }
>
> -static inline int is_active_lru(enum lru_list lru)
> +static inline bool is_active_lru(enum lru_list lru)
>  {
>         return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
>  }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 527617ee9b73..df859b1d583c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -79,6 +79,13 @@ struct scan_control {
>          */
>         struct mem_cgroup *target_mem_cgroup;
>
> +       /* Can active pages be deactivated as part of reclaim? */
> +#define DEACTIVATE_ANON 1
> +#define DEACTIVATE_FILE 2
> +       unsigned int may_deactivate:2;
> +       unsigned int force_deactivate:1;
> +       unsigned int skipped_deactivate:1;
> +
>         /* Writepage batching in laptop mode; RECLAIM_WRITE */
>         unsigned int may_writepage:1;
>
> @@ -101,6 +108,9 @@ struct scan_control {
>         /* One of the zones is ready for compaction */
>         unsigned int compaction_ready:1;
>
> +       /* There is easily reclaimable cold cache in the current node */
> +       unsigned int cache_trim_mode:1;
> +
>         /* The file pages on the current node are dangerously low */
>         unsigned int file_is_tiny:1;
>
> @@ -2154,6 +2164,20 @@ unsigned long reclaim_pages(struct list_head *page_list)
>         return nr_reclaimed;
>  }
>
> +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
> +                                struct lruvec *lruvec, struct scan_control *sc)
> +{
> +       if (is_active_lru(lru)) {
> +               if (sc->may_deactivate & (1 << is_file_lru(lru)))
> +                       shrink_active_list(nr_to_scan, lruvec, sc, lru);
> +               else
> +                       sc->skipped_deactivate = 1;
> +               return 0;
> +       }
> +
> +       return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
> +}
> +
>  /*
>   * The inactive anon list should be small enough that the VM never has
>   * to do too much work.
> @@ -2182,59 +2206,25 @@ unsigned long reclaim_pages(struct list_head *page_list)
>   *    1TB     101        10GB
>   *   10TB     320        32GB
>   */
> -static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
> -                                struct scan_control *sc, bool trace)
> +static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
>  {
> -       enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
> -       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> -       enum lru_list inactive_lru = file * LRU_FILE;
> +       enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
>         unsigned long inactive, active;
>         unsigned long inactive_ratio;
> -       struct lruvec *target_lruvec;
> -       unsigned long refaults;
>         unsigned long gb;
>
> -       inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
> -       active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
> +       inactive = lruvec_page_state(lruvec, inactive_lru);
> +       active = lruvec_page_state(lruvec, active_lru);
>
> -       /*
> -        * When refaults are being observed, it means a new workingset
> -        * is being established. Disable active list protection to get
> -        * rid of the stale workingset quickly.
> -        */
> -       target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
> -       refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
> -       if (file && target_lruvec->refaults != refaults) {
> -               inactive_ratio = 0;
> -       } else {
> -               gb = (inactive + active) >> (30 - PAGE_SHIFT);
> -               if (gb)
> -                       inactive_ratio = int_sqrt(10 * gb);
> -               else
> -                       inactive_ratio = 1;
> -       }
> -
> -       if (trace)
> -               trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
> -                       lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
> -                       lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
> -                       inactive_ratio, file);
> +       gb = (inactive + active) >> (30 - PAGE_SHIFT);
> +       if (gb)
> +               inactive_ratio = int_sqrt(10 * gb);
> +       else
> +               inactive_ratio = 1;
>
>         return inactive * inactive_ratio < active;
>  }
>
> -static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
> -                                struct lruvec *lruvec, struct scan_control *sc)
> -{
> -       if (is_active_lru(lru)) {
> -               if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
> -                       shrink_active_list(nr_to_scan, lruvec, sc, lru);
> -               return 0;
> -       }
> -
> -       return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
> -}
> -
>  enum scan_balance {
>         SCAN_EQUAL,
>         SCAN_FRACT,
> @@ -2296,28 +2286,17 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>
>         /*
>          * If the system is almost out of file pages, force-scan anon.
> -        * But only if there are enough inactive anonymous pages on
> -        * the LRU. Otherwise, the small LRU gets thrashed.
>          */
> -       if (sc->file_is_tiny &&
> -           !inactive_list_is_low(lruvec, false, sc, false) &&
> -           lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
> -                           sc->reclaim_idx) >> sc->priority) {
> +       if (sc->file_is_tiny) {
>                 scan_balance = SCAN_ANON;
>                 goto out;
>         }
>
>         /*
> -        * If there is enough inactive page cache, i.e. if the size of the
> -        * inactive list is greater than that of the active list *and* the
> -        * inactive list actually has some pages to scan on this priority, we
> -        * do not reclaim anything from the anonymous working set right now.
> -        * Without the second condition we could end up never scanning an
> -        * lruvec even if it has plenty of old anonymous pages unless the
> -        * system is under heavy pressure.
> +        * If there is enough inactive page cache, we do not reclaim
> +        * anything from the anonymous working right now.
>          */
> -       if (!inactive_list_is_low(lruvec, true, sc, false) &&
> -           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
> +       if (sc->cache_trim_mode) {
>                 scan_balance = SCAN_FILE;
>                 goto out;
>         }
> @@ -2582,7 +2561,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>          * Even if we did not try to evict anon pages at all, we want to
>          * rebalance the anon lru active/inactive ratio.
>          */
> -       if (total_swap_pages && inactive_list_is_low(lruvec, false, sc, true))
> +       if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
>                 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
>                                    sc, LRU_ACTIVE_ANON);
>  }
> @@ -2722,6 +2701,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>         unsigned long nr_reclaimed, nr_scanned;
>         struct lruvec *target_lruvec;
>         bool reclaimable = false;
> +       unsigned long file;
>
>         target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
>
> @@ -2731,6 +2711,44 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>         nr_reclaimed = sc->nr_reclaimed;
>         nr_scanned = sc->nr_scanned;
>
> +       /*
> +        * Target desirable inactive:active list ratios for the anon
> +        * and file LRU lists.
> +        */
> +       if (!sc->force_deactivate) {
> +               unsigned long refaults;
> +
> +               if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
> +                       sc->may_deactivate |= DEACTIVATE_ANON;
> +               else
> +                       sc->may_deactivate &= ~DEACTIVATE_ANON;
> +
> +               /*
> +                * When refaults are being observed, it means a new
> +                * workingset is being established. Deactivate to get
> +                * rid of any stale active pages quickly.
> +                */
> +               refaults = lruvec_page_state(target_lruvec,
> +                                            WORKINGSET_ACTIVATE);
> +               if (refaults != target_lruvec->refaults ||
> +                   inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
> +                       sc->may_deactivate |= DEACTIVATE_FILE;
> +               else
> +                       sc->may_deactivate &= ~DEACTIVATE_FILE;
> +       } else
> +               sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
> +
> +       /*
> +        * If we have plenty of inactive file pages that aren't
> +        * thrashing, try to reclaim those first before touching
> +        * anonymous pages.
> +        */
> +       file = lruvec_page_state(target_lruvec, LRU_INACTIVE_FILE);
> +       if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
> +               sc->cache_trim_mode = 1;
> +       else
> +               sc->cache_trim_mode = 0;
> +
>         /*
>          * Prevent the reclaimer from falling into the cache trap: as
>          * cache pages start out inactive, every cache fault will tip
> @@ -2741,10 +2759,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>          * anon pages. Try to detect this based on file LRU size.
>          */
>         if (!cgroup_reclaim(sc)) {
> -               unsigned long file;
> -               unsigned long free;
> -               int z;
>                 unsigned long total_high_wmark = 0;
> +               unsigned long free, anon;
> +               int z;
>
>                 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
>                 file = node_page_state(pgdat, NR_ACTIVE_FILE) +
> @@ -2758,7 +2775,17 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>                         total_high_wmark += high_wmark_pages(zone);
>                 }
>
> -               sc->file_is_tiny = file + free <= total_high_wmark;
> +               /*
> +                * Consider anon: if that's low too, this isn't a
> +                * runaway file reclaim problem, but rather just
> +                * extreme pressure. Reclaim as per usual then.
> +                */
> +               anon = node_page_state(pgdat, NR_INACTIVE_ANON);
> +
> +               sc->file_is_tiny =
> +                       file + free <= total_high_wmark &&
> +                       !(sc->may_deactivate & DEACTIVATE_ANON) &&
> +                       anon >> sc->priority;

The name of the file_is_tiny flag no longer seems to match its actual
semantics. Maybe rename it to "skip_file"?

I'm also confused about why !(sc->may_deactivate & DEACTIVATE_ANON)
should be a prerequisite for skipping file LRU reclaim. IIUC this means
we will skip reclaiming from the file LRU only when anonymous page
deactivation is not allowed. Could you please add a comment explaining
this?
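If my reading is right, this is the recursive counterpart of the "only
if there are enough inactive anonymous pages on the LRU, otherwise the
small LRU gets thrashed" check that this patch deletes from
get_scan_count(). If so, even something along these lines would help
(my wording, guessing at the intent, feel free to improve):

        /*
         * Only force anon scanning when there is enough inactive
         * anon to reclaim without having to deactivate, i.e. when
         * the anon workingset itself is not at risk of thrashing.
         */
        sc->file_is_tiny =
                file + free <= total_high_wmark &&
                !(sc->may_deactivate & DEACTIVATE_ANON) &&
                anon >> sc->priority;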
>  }
>
>         shrink_node_memcgs(pgdat, sc);
> @@ -3062,9 +3089,27 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
>         if (sc->compaction_ready)
>                 return 1;
>
> +       /*
> +        * We make inactive:active ratio decisions based on the node's
> +        * composition of memory, but a restrictive reclaim_idx or a
> +        * memory.low cgroup setting can exempt large amounts of
> +        * memory from reclaim. Neither of which are very common, so
> +        * instead of doing costly eligibility calculations of the
> +        * entire cgroup subtree up front, we assume the estimates are
> +        * good, and retry with forcible deactivation if that fails.
> +        */
> +       if (sc->skipped_deactivate) {
> +               sc->priority = initial_priority;
> +               sc->force_deactivate = 1;
> +               sc->skipped_deactivate = 0;
> +               goto retry;
> +       }
> +
>         /* Untapped cgroup reserves? Don't OOM, retry. */
>         if (sc->memcg_low_skipped) {
>                 sc->priority = initial_priority;
> +               sc->force_deactivate = 0;
> +               sc->skipped_deactivate = 0;
>                 sc->memcg_low_reclaim = 1;
>                 sc->memcg_low_skipped = 0;
>                 goto retry;
> @@ -3347,18 +3392,20 @@ static void age_active_anon(struct pglist_data *pgdat,
>                              struct scan_control *sc)
>  {
>         struct mem_cgroup *memcg;
> +       struct lruvec *lruvec;
>
>         if (!total_swap_pages)
>                 return;
>
> +       lruvec = mem_cgroup_lruvec(NULL, pgdat);
> +       if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
> +               return;
> +
>         memcg = mem_cgroup_iter(NULL, NULL, NULL);
>         do {
> -               struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
> -
> -               if (inactive_list_is_low(lruvec, false, sc, true))
> -                       shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> -                                          sc, LRU_ACTIVE_ANON);
> -
> +               lruvec = mem_cgroup_lruvec(memcg, pgdat);
> +               shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> +                                  sc, LRU_ACTIVE_ANON);
>                 memcg = mem_cgroup_iter(NULL, memcg, NULL);
>         } while (memcg);
>  }
> --
> 2.24.0
>
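As an aside, while reviewing I wrote a quick userspace sketch (not
kernel code; plain C with libm's sqrt() standing in for int_sqrt(), and
sizes given directly in GB) to sanity-check the inactive_ratio heuristic
that inactive_is_low() keeps, against the table quoted in the comment
above. Build with "cc ratio.c -lm":

#include <math.h>
#include <stdio.h>

int main(void)
{
	/* total (inactive + active) LRU size in GB to sample */
	static const unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };

	for (unsigned int i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
		unsigned long gb = sizes_gb[i];
		/* inactive_ratio = int_sqrt(10 * gb), or 1 below 1GB */
		unsigned long ratio = gb ? (unsigned long)sqrt(10.0 * gb) : 1;
		/*
		 * inactive_is_low() fires when inactive * ratio < active,
		 * so the balance point is inactive ~= total / (ratio + 1).
		 */
		double target_gb = (double)gb / (ratio + 1);

		printf("%6luG total -> ratio %3lu, inactive target ~%.1fG\n",
		       gb, ratio, target_gb);
	}
	return 0;
}

For 1TB and 10TB of total LRU pages this prints ratios of 101 and 320
with inactive targets of ~10G and ~32G, matching the two rows quoted in
the comment, so the switch from lruvec_lru_size() to
lruvec_page_state() doesn't change the curve itself.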