On Mon, Jul 29, 2024 at 9:52 AM T.J. Mercier <tjmercier@xxxxxxxxxx> wrote:
>
> From: Yu Zhao <yuzhao@xxxxxxxxxx>
>
> mem_cgroup_calculate_protection() is not stateless and should only be used
> as part of a top-down tree traversal. shrink_one() traverses the per-node
> memcg LRU instead of the root_mem_cgroup tree, and therefore it should not
> call mem_cgroup_calculate_protection().
>
> The existing misuse in shrink_one() can cause ineffective protection of
> sub-trees that are grandchildren of root_mem_cgroup. Fix it by reusing
> lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to
> calculate the protection.
>
> Previously, lru_gen_age_node() opportunistically skipped the first pass,
> i.e., when scan_control->priority was DEF_PRIORITY. On the second pass,
> lruvec_is_sizable() used the appropriate scan_control->priority, set by
> set_initial_priority() from lru_gen_shrink_node(), to decide whether a
> memcg is too small to reclaim from.
>
> Now that lru_gen_age_node() unconditionally traverses the root_mem_cgroup
> tree, it should call set_initial_priority() upfront to make sure
> lruvec_is_sizable() uses the appropriate scan_control->priority on the
> first pass. Otherwise, lruvec_is_reclaimable() can return false negatives
> and result in premature OOM kills when min_ttl_ms is used.
>
> Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@xxxxxxxxxx
> Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
> Change-Id: I2ff1de0c7a3fae01370d99198d3a1b04c109aac6
> Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
> Reported-by: T.J. Mercier <tjmercier@xxxxxxxxxx>
> Cc: <stable@xxxxxxxxxxxxxxx>
> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> (cherry picked from commit 30d77b7eef019fa4422980806e8b7cdc8674493e)
> [TJ: moved up the existing set_initial_priority from this branch
> instead of the upstream version with changes from other patches]
> Signed-off-by: T.J. Mercier <tjmercier@xxxxxxxxxx>
> ---
>  mm/vmscan.c | 75 ++++++++++++++++++++++++-----------------------------
>  1 file changed, 34 insertions(+), 41 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index e9d4c1f6d7bb..627c4d3b4c04 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4545,6 +4545,28 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
>  /******************************************************************************
>   *                          working set protection
>   ******************************************************************************/
> +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
> +{
> +        int priority;
> +        unsigned long reclaimable;
> +        struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
> +
> +        if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
> +                return;
> +        /*
> +         * Determine the initial priority based on
> +         * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
> +         * where reclaimed_to_scanned_ratio = inactive / total.
> +         */
> +        reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
> +        if (get_swappiness(lruvec, sc))
> +                reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
> +
> +        /* round down reclaimable and round up sc->nr_to_reclaim */
> +        priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
> +
> +        sc->priority = clamp(priority, 0, DEF_PRIORITY);
> +}
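For reference, the comment above amounts to priority = floor(log2(reclaimable)) - ceil(log2(nr_to_reclaim)). A minimal userspace sketch of that arithmetic, with fls_long() stood in by __builtin_clzl() and purely hypothetical inputs (DEF_PRIORITY is 12, as in the kernel):

    #include <stdio.h>

    #define DEF_PRIORITY 12

    /* userspace stand-in for the kernel's fls_long(): 1-based index of the
       most significant set bit, 0 when no bit is set; assumes 64-bit long */
    static int fls_long(unsigned long x)
    {
            return x ? 64 - __builtin_clzl(x) : 0;
    }

    int main(void)
    {
            /* hypothetical inputs: ~1 GiB of reclaimable 4 KiB pages,
               and a reclaim target of 1024 pages */
            unsigned long reclaimable = 1UL << 18;
            unsigned long nr_to_reclaim = 1024;

            /* round down reclaimable and round up nr_to_reclaim */
            int priority = fls_long(reclaimable) - 1 - fls_long(nr_to_reclaim - 1);

            if (priority < 0)
                    priority = 0;
            if (priority > DEF_PRIORITY)
                    priority = DEF_PRIORITY;

            printf("initial priority = %d\n", priority); /* prints 8 */
            return 0;
    }

With these inputs, scanning total >> 8 covers about 1024 of the 2^18 reclaimable pages, which meets nr_to_reclaim when the inactive/total ratio is close to 1.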
>
>  static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
>  {
> @@ -4579,19 +4601,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
>          struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>          DEFINE_MIN_SEQ(lruvec);
>
> -        /* see the comment on lru_gen_folio */
> -        gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
> -        birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
> -
> -        if (time_is_after_jiffies(birth + min_ttl))
> +        if (mem_cgroup_below_min(NULL, memcg))
>                  return false;
>
>          if (!lruvec_is_sizable(lruvec, sc))
>                  return false;
>
> -        mem_cgroup_calculate_protection(NULL, memcg);
> +        /* see the comment on lru_gen_folio */
> +        gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
> +        birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
>
> -        return !mem_cgroup_below_min(NULL, memcg);
> +        return time_is_before_jiffies(birth + min_ttl);
>  }
>
>  /* to protect the working set of the last N jiffies */
> @@ -4601,23 +4621,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>  {
>          struct mem_cgroup *memcg;
>          unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
> +        bool reclaimable = !min_ttl;
>
>          VM_WARN_ON_ONCE(!current_is_kswapd());
>
> -        /* check the order to exclude compaction-induced reclaim */
> -        if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
> -                return;
> +        set_initial_priority(pgdat, sc);
>
>          memcg = mem_cgroup_iter(NULL, NULL, NULL);
>          do {
>                  struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
>
> -                if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
> -                        mem_cgroup_iter_break(NULL, memcg);
> -                        return;
> -                }
> +                mem_cgroup_calculate_protection(NULL, memcg);
>
> -                cond_resched();
> +                if (!reclaimable)
> +                        reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
>          } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
>
>          /*
> @@ -4625,7 +4642,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>           * younger than min_ttl. However, another possibility is all memcgs are
>           * either too small or below min.
>           */
> -        if (mutex_trylock(&oom_lock)) {
> +        if (!reclaimable && mutex_trylock(&oom_lock)) {
>                  struct oom_control oc = {
>                          .gfp_mask = sc->gfp_mask,
>                  };
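This loop is also why the protection pass belongs here rather than in shrink_one(): mem_cgroup_iter() visits parents before their children, and each memcg's effective protection is derived from its parent's. A deliberately simplified model of that ordering dependency follows; the struct and field names are hypothetical, and the real mem_cgroup_calculate_protection() distributes protection proportionally rather than taking a minimum:

    struct cg {
            struct cg *parent;
            unsigned long min;   /* configured memory.min */
            unsigned long emin;  /* effective floor, cached by the walk */
    };

    /* One pre-order step: only meaningful if c->parent->emin was computed
       first, which is exactly what a top-down tree traversal guarantees. */
    static void calc_protection(struct cg *c)
    {
            if (!c->parent) {
                    c->emin = c->min;
                    return;
            }
            /* a child cannot be protected beyond its parent's effective floor */
            c->emin = c->min < c->parent->emin ? c->min : c->parent->emin;
    }

shrink_one(), by contrast, walks the per-node memcg LRU in an order unrelated to the tree, so recomputing the cached value there can leave it stale for deeper descendants; that is the ineffective protection of root_mem_cgroup's grandchildren described in the commit message.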
> @@ -5425,8 +5442,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>          struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>          struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>
> -        mem_cgroup_calculate_protection(NULL, memcg);
> -
> +        /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
>          if (mem_cgroup_below_min(NULL, memcg))
>                  return MEMCG_LRU_YOUNG;
>
> @@ -5566,29 +5582,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
>
>  #endif
>
> -static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
> -{
> -        int priority;
> -        unsigned long reclaimable;
> -        struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
> -
> -        if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
> -                return;
> -        /*
> -         * Determine the initial priority based on
> -         * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
> -         * where reclaimed_to_scanned_ratio = inactive / total.
> -         */
> -        reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
> -        if (get_swappiness(lruvec, sc))
> -                reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
> -
> -        /* round down reclaimable and round up sc->nr_to_reclaim */
> -        priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
> -
> -        sc->priority = clamp(priority, 0, DEF_PRIORITY);
> -}
> -
>  static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
>  {
>          struct blk_plug plug;
> --
> 2.46.0.rc1.232.g9752f9e123-goog
>

Please ignore this patch. I didn't see Yu's existing thread for this.