On Thu, Jan 11, 2024 at 10:33 AM Kairui Song <ryncsn@xxxxxxxxx> wrote:
>
> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> When lru_gen is aging, it will update mm counters page by page,
> which causes a higher overhead if aging happens frequently or there
> are a lot of pages in one generation getting moved.
> Optimize this by doing the counter updates in batch.
>
> Although most __mod_*_state functions have their own caches, the
> overhead is still observable.
>
> Tested in a 4G memcg on an EPYC 7K62 with:
>
>   memcached -u nobody -m 16384 -s /tmp/memcached.socket \
>     -a 0766 -t 16 -B binary &
>
>   memtier_benchmark -S /tmp/memcached.socket \
>     -P memcache_binary -n allkeys \
>     --key-minimum=1 --key-maximum=16000000 -d 1024 \
>     --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
>
> Average result of 18 test runs:
>
> Before: 44017.78 Ops/sec
> After:  44687.08 Ops/sec (+1.5%)

I see the same performance numbers quoted in all 3 patches. How much
performance improvement does this particular patch provide (and the
same question for the other 2 patches)?

If, as the cover letter says, most of the performance benefit comes
from patch 3 (prefetching), can we take that patch alone and avoid the
extra complexity?

> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
>  mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 55 insertions(+), 9 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 4f9c854ce6cc..185d53607c7e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3113,9 +3113,47 @@ static int folio_update_gen(struct folio *folio, int gen)
>  	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
>  }
>
> +/*
> + * Update LRU gen in batch for each lru_gen LRU list. The batch is limited
> + * to one gen / type / zone level LRU list, and is applied after scanning of
> + * that list finishes or is aborted.
> + */
> +struct gen_update_batch {
> +	int delta[MAX_NR_GENS];
> +};
> +
> +static void lru_gen_update_batch(struct lruvec *lruvec, int type, int zone,
> +				 struct gen_update_batch *batch)
> +{
> +	int gen;
> +	int promoted = 0;
> +	struct lru_gen_folio *lrugen = &lruvec->lrugen;
> +	enum lru_list lru = type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
> +
> +	for (gen = 0; gen < MAX_NR_GENS; gen++) {
> +		int delta = batch->delta[gen];
> +
> +		if (!delta)
> +			continue;
> +
> +		WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
> +			   lrugen->nr_pages[gen][type][zone] + delta);
> +
> +		if (lru_gen_is_active(lruvec, gen))
> +			promoted += delta;
> +	}
> +
> +	if (promoted) {
> +		__update_lru_size(lruvec, lru, zone, -promoted);
> +		__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, promoted);
> +	}
> +}
> +
>  /* protect pages accessed multiple times through file descriptors */
> -static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
> +static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio,
> +			 bool reclaiming, struct gen_update_batch *batch)
>  {
> +	int delta = folio_nr_pages(folio);
>  	int type = folio_is_file_lru(folio);
>  	struct lru_gen_folio *lrugen = &lruvec->lrugen;
>  	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
> @@ -3138,7 +3176,8 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
>  		new_flags |= BIT(PG_reclaim);
>  	} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
>
> -	lru_gen_update_size(lruvec, folio, old_gen, new_gen);
> +	batch->delta[old_gen] -= delta;
> +	batch->delta[new_gen] += delta;
>
>  	return new_gen;
>  }
> @@ -3672,6 +3711,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
>  {
>  	int zone;
>  	int remaining = MAX_LRU_BATCH;
> +	struct gen_update_batch batch = { };

Can this batch variable be moved off the stack? We (Google) use a much
larger value for MAX_NR_GENS internally, so the stack allocation from
"struct gen_update_batch batch" can significantly increase the risk of
stack overflow for our use cases. (One possible alternative is
sketched at the end of this mail.)

>  	struct lru_gen_folio *lrugen = &lruvec->lrugen;
>  	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
>
> @@ -3690,12 +3730,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
>  			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
>  			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
>
> -			new_gen = folio_inc_gen(lruvec, folio, false);
> +			new_gen = folio_inc_gen(lruvec, folio, false, &batch);
>  			list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
>
> -			if (!--remaining)
> +			if (!--remaining) {
> +				lru_gen_update_batch(lruvec, type, zone, &batch);
>  				return false;
> +			}
>  		}
> +		lru_gen_update_batch(lruvec, type, zone, &batch);
>  	}
> done:
>  	reset_ctrl_pos(lruvec, type, true);
> @@ -4215,7 +4258,7 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
>  ******************************************************************************/
>
>  static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
> -		       int tier_idx)
> +		       int tier_idx, struct gen_update_batch *batch)
>  {
>  	bool success;
>  	int gen = folio_lru_gen(folio);
> @@ -4257,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
>  	if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
>  		int hist = lru_hist_from_seq(lrugen->min_seq[type]);
>
> -		gen = folio_inc_gen(lruvec, folio, false);
> +		gen = folio_inc_gen(lruvec, folio, false, batch);
>  		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
>
>  		WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
> @@ -4267,7 +4310,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
>
>  	/* ineligible */
>  	if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
> -		gen = folio_inc_gen(lruvec, folio, false);
> +		gen = folio_inc_gen(lruvec, folio, false, batch);
>  		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
>  		return true;
>  	}
> @@ -4275,7 +4318,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
>  	/* waiting for writeback */
>  	if (folio_test_locked(folio) || folio_test_writeback(folio) ||
>  	    (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
> -		gen = folio_inc_gen(lruvec, folio, true);
> +		gen = folio_inc_gen(lruvec, folio, true, batch);
>  		list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
>  		return true;
>  	}
> @@ -4341,6 +4384,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>  	for (i = MAX_NR_ZONES; i > 0; i--) {
>  		LIST_HEAD(moved);
>  		int skipped_zone = 0;
> +		struct gen_update_batch batch = { };
>  		int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
>  		struct list_head *head = &lrugen->folios[gen][type][zone];
>
> @@ -4355,7 +4399,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>
>  			scanned += delta;
>
> -			if (sort_folio(lruvec, folio, sc, tier))
> +			if (sort_folio(lruvec, folio, sc, tier, &batch))
>  				sorted += delta;
>  			else if (isolate_folio(lruvec, folio, sc)) {
>  				list_add(&folio->lru, list);
> @@ -4375,6 +4419,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>  			skipped += skipped_zone;
>  		}
>
> +		lru_gen_update_batch(lruvec, type, zone, &batch);
> +
>  		if (!remaining || isolated >= MIN_LRU_BATCH)
>  			break;
>  	}
> --
> 2.43.0
>
>
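For illustration, here is a rough sketch of one way the batch could be
taken off the stack. This is untested and not a drop-in patch; the
names gen_batch and gen_batch_begin are made up for this example. If I
am reading the locking right, both inc_min_seq() and scan_folios() run
with lruvec->lru_lock held, so a single scratch buffer embedded in the
(already heap-allocated) lruvec would not need any extra
synchronization:

	/* hypothetical: scratch space embedded in struct lruvec */
	struct lruvec {
		/* ... existing members ... */
		/* batched gen updates, protected by lru_lock */
		struct gen_update_batch gen_batch;
	};

	/* hypothetical helper replacing the on-stack "batch = { }" */
	static struct gen_update_batch *gen_batch_begin(struct lruvec *lruvec)
	{
		lockdep_assert_held(&lruvec->lru_lock);
		memset(&lruvec->gen_batch, 0, sizeof(lruvec->gen_batch));
		return &lruvec->gen_batch;
	}

Callers would then do "struct gen_update_batch *batch =
gen_batch_begin(lruvec);" and pass the pointer through exactly as this
patch already does, which keeps the MAX_NR_GENS-sized array off the
reclaim call stack no matter how large MAX_NR_GENS is configured.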