On Thu, Jan 11, 2024 at 10:33 AM Kairui Song <ryncsn@xxxxxxxxx> wrote:
>
> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> When lru_gen is aging, it will update mm counters page by page,
> which causes a higher overhead if aging happens frequently or there
> are a lot of pages in one generation getting moved.
> Optimize this by doing the counter updates in batch.
>
> Although most __mod_*_state functions have their own caches, the
> overhead is still observable.
>
> Tested in a 4G memcg on an EPYC 7K62 with:
>
>   memcached -u nobody -m 16384 -s /tmp/memcached.socket \
>     -a 0766 -t 16 -B binary &
>
>   memtier_benchmark -S /tmp/memcached.socket \
>     -P memcache_binary -n allkeys \
>     --key-minimum=1 --key-maximum=16000000 -d 1024 \
>     --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
>
> Average result of 18 test runs:
>
> Before: 44017.78 Ops/sec
> After:  44687.08 Ops/sec (+1.5%)

I see the same performance numbers quoted in all 3 patches. How much
performance improvement does this particular patch provide (and the
same question for the other 2 patches)?

If, as the cover letter says, most of the performance benefit comes
from patch 3 (prefetching), can we take that patch alone and avoid the
extra complexity?

> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
>  mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 55 insertions(+), 9 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 4f9c854ce6cc..185d53607c7e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3113,9 +3113,47 @@ static int folio_update_gen(struct folio *folio, int gen)
>  	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
>  }
>
> +/*
> + * Update LRU gen in batch for each lru_gen LRU list. The batch is limited
> + * to one gen / type / zone level LRU list, and is applied after scanning of
> + * that list finishes or is aborted.
> + */
> +struct gen_update_batch {
> +	int delta[MAX_NR_GENS];
> +};
> +
> +static void lru_gen_update_batch(struct lruvec *lruvec, int type, int zone,
> +				 struct gen_update_batch *batch)
> +{
> +	int gen;
> +	int promoted = 0;
> +	struct lru_gen_folio *lrugen = &lruvec->lrugen;
> +	enum lru_list lru = type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
> +
> +	for (gen = 0; gen < MAX_NR_GENS; gen++) {
> +		int delta = batch->delta[gen];
> +
> +		if (!delta)
> +			continue;
> +
> +		WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
> +			   lrugen->nr_pages[gen][type][zone] + delta);
> +
> +		if (lru_gen_is_active(lruvec, gen))
> +			promoted += delta;
> +	}
> +
> +	if (promoted) {
> +		__update_lru_size(lruvec, lru, zone, -promoted);
> +		__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, promoted);
> +	}
> +}
> +
>  /* protect pages accessed multiple times through file descriptors */
> -static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
> +static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio,
> +			 bool reclaiming, struct gen_update_batch *batch)
>  {
> +	int delta = folio_nr_pages(folio);
>  	int type = folio_is_file_lru(folio);
>  	struct lru_gen_folio *lrugen = &lruvec->lrugen;
>  	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
> @@ -3138,7 +3176,8 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
>  		new_flags |= BIT(PG_reclaim);
>  	} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
>
> -	lru_gen_update_size(lruvec, folio, old_gen, new_gen);
> +	batch->delta[old_gen] -= delta;
> +	batch->delta[new_gen] += delta;
>
>  	return new_gen;
>  }
> @@ -3672,6 +3711,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
>  {
>  	int zone;
>  	int remaining = MAX_LRU_BATCH;
> +	struct gen_update_batch batch = { };

Can this batch variable be moved off the stack? We (Google) use a much
larger value for MAX_NR_GENS internally, so the stack allocation from
"struct gen_update_batch batch" can significantly increase the risk of
stack overflow for our use cases. (One possible alternative is
sketched at the end of this mail.)

>  	struct lru_gen_folio *lrugen = &lruvec->lrugen;
>  	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
>
> @@ -3690,12 +3730,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
>  			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
>  			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
>
> -			new_gen = folio_inc_gen(lruvec, folio, false);
> +			new_gen = folio_inc_gen(lruvec, folio, false, &batch);
>  			list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
>
> -			if (!--remaining)
> +			if (!--remaining) {
> +				lru_gen_update_batch(lruvec, type, zone, &batch);
>  				return false;
> +			}
>  		}
> +		lru_gen_update_batch(lruvec, type, zone, &batch);
>  	}
> done:
>  	reset_ctrl_pos(lruvec, type, true);
> @@ -4215,7 +4258,7 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
>  ******************************************************************************/
>
>  static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
> -		       int tier_idx)
> +		       int tier_idx, struct gen_update_batch *batch)
>  {
>  	bool success;
>  	int gen = folio_lru_gen(folio);
> @@ -4257,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
>  	if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
>  		int hist = lru_hist_from_seq(lrugen->min_seq[type]);
>
> -		gen = folio_inc_gen(lruvec, folio, false);
> +		gen = folio_inc_gen(lruvec, folio, false, batch);
>  		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
>
>  		WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
> @@ -4267,7 +4310,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
>
>  	/* ineligible */
>  	if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
> -		gen = folio_inc_gen(lruvec, folio, false);
> +		gen = folio_inc_gen(lruvec, folio, false, batch);
>  		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
>  		return true;
>  	}
> @@ -4275,7 +4318,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
>  	/* waiting for writeback */
>  	if (folio_test_locked(folio) || folio_test_writeback(folio) ||
>  	    (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
> -		gen = folio_inc_gen(lruvec, folio, true);
> +		gen = folio_inc_gen(lruvec, folio, true, batch);
>  		list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
>  		return true;
>  	}
> @@ -4341,6 +4384,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>  	for (i = MAX_NR_ZONES; i > 0; i--) {
>  		LIST_HEAD(moved);
>  		int skipped_zone = 0;
> +		struct gen_update_batch batch = { };
>  		int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
>  		struct list_head *head = &lrugen->folios[gen][type][zone];
>
> @@ -4355,7 +4399,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>
>  			scanned += delta;
>
> -			if (sort_folio(lruvec, folio, sc, tier))
> +			if (sort_folio(lruvec, folio, sc, tier, &batch))
>  				sorted += delta;
>  			else if (isolate_folio(lruvec, folio, sc)) {
>  				list_add(&folio->lru, list);
> @@ -4375,6 +4419,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>  			skipped += skipped_zone;
>  		}
>
> +		lru_gen_update_batch(lruvec, type, zone, &batch);
> +
>  		if (!remaining || isolated >= MIN_LRU_BATCH)
>  			break;
>  	}
> --
> 2.43.0
>
>
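For illustration, here is a rough sketch of one way the batch could be
taken off the stack. This is untested and not a drop-in patch; the
names gen_batch and gen_batch_begin are made up for this example. If I
am reading the locking right, both inc_min_seq() and scan_folios() run
with lruvec->lru_lock held, so a single scratch buffer embedded in the
(already heap-allocated) lruvec would not need any extra
synchronization:

	/* hypothetical: scratch space embedded in struct lruvec */
	struct lruvec {
		/* ... existing members ... */
		/* batched gen updates, protected by lru_lock */
		struct gen_update_batch gen_batch;
	};

	/* hypothetical helper replacing the on-stack "batch = { }" */
	static struct gen_update_batch *gen_batch_begin(struct lruvec *lruvec)
	{
		lockdep_assert_held(&lruvec->lru_lock);
		memset(&lruvec->gen_batch, 0, sizeof(lruvec->gen_batch));
		return &lruvec->gen_batch;
	}

Callers would then do "struct gen_update_batch *batch =
gen_batch_begin(lruvec);" and pass the pointer through exactly as this
patch already does, which keeps the MAX_NR_GENS-sized array off the
reclaim call stack no matter how large MAX_NR_GENS is configured.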