Re: [PATCH 6.6.y] mm/mglru: fix ineffective protection calculation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Jul 29, 2024 at 9:52 AM T.J. Mercier <tjmercier@xxxxxxxxxx> wrote:
>
> From: Yu Zhao <yuzhao@xxxxxxxxxx>
>
> mem_cgroup_calculate_protection() is not stateless and should only be used
> as part of a top-down tree traversal.  shrink_one() traverses the per-node
> memcg LRU instead of the root_mem_cgroup tree, and therefore it should not
> call mem_cgroup_calculate_protection().
>
> The existing misuse in shrink_one() can cause ineffective protection of
> sub-trees that are grandchildren of root_mem_cgroup.  Fix it by reusing
> lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to
> calculate the protection.
>
> Previously lru_gen_age_node() opportunistically skipped the first pass,
> i.e., when scan_control->priority was DEF_PRIORITY.  On the second pass,
> lruvec_is_sizable() used the appropriate scan_control->priority, set by
> set_initial_priority() from lru_gen_shrink_node(), to decide whether a
> memcg was too small to reclaim from.
>
> Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup tree.
> So it should call set_initial_priority() upfront, to make sure
> lruvec_is_sizable() uses the appropriate scan_control->priority on the
> first pass.  Otherwise, lruvec_is_reclaimable() can return false negatives
> and result in premature OOM kills when min_ttl_ms is used.
>
> Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@xxxxxxxxxx
> Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
> Change-Id: I2ff1de0c7a3fae01370d99198d3a1b04c109aac6
> Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
> Reported-by: T.J. Mercier <tjmercier@xxxxxxxxxx>
> Cc: <stable@xxxxxxxxxxxxxxx>
> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> (cherry picked from commit 30d77b7eef019fa4422980806e8b7cdc8674493e)
> [TJ: moved up the existing set_initial_priority() from this branch
> instead of taking the upstream version, which includes changes from
> other patches]
> Signed-off-by: T.J. Mercier <tjmercier@xxxxxxxxxx>
> ---
>  mm/vmscan.c | 75 ++++++++++++++++++++++++-----------------------------
>  1 file changed, 34 insertions(+), 41 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index e9d4c1f6d7bb..627c4d3b4c04 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4545,6 +4545,28 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
>  /******************************************************************************
>   *                          working set protection
>   ******************************************************************************/
> +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
> +{
> +       int priority;
> +       unsigned long reclaimable;
> +       struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
> +
> +       if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
> +               return;
> +       /*
> +        * Determine the initial priority based on
> +        * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
> +        * where reclaimed_to_scanned_ratio = inactive / total.
> +        */
> +       reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
> +       if (get_swappiness(lruvec, sc))
> +               reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
> +
> +       /* round down reclaimable and round up sc->nr_to_reclaim */
> +       priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
> +
> +       sc->priority = clamp(priority, 0, DEF_PRIORITY);
> +}
>
>  static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
>  {
> @@ -4579,19 +4601,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
>         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>         DEFINE_MIN_SEQ(lruvec);
>
> -       /* see the comment on lru_gen_folio */
> -       gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
> -       birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
> -
> -       if (time_is_after_jiffies(birth + min_ttl))
> +       if (mem_cgroup_below_min(NULL, memcg))
>                 return false;
>
>         if (!lruvec_is_sizable(lruvec, sc))
>                 return false;
>
> -       mem_cgroup_calculate_protection(NULL, memcg);
> +       /* see the comment on lru_gen_folio */
> +       gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
> +       birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
>
> -       return !mem_cgroup_below_min(NULL, memcg);
> +       return time_is_before_jiffies(birth + min_ttl);
>  }
>
>  /* to protect the working set of the last N jiffies */
> @@ -4601,23 +4621,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>  {
>         struct mem_cgroup *memcg;
>         unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
> +       bool reclaimable = !min_ttl;
>
>         VM_WARN_ON_ONCE(!current_is_kswapd());
>
> -       /* check the order to exclude compaction-induced reclaim */
> -       if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
> -               return;
> +       set_initial_priority(pgdat, sc);
>
>         memcg = mem_cgroup_iter(NULL, NULL, NULL);
>         do {
>                 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
>
> -               if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
> -                       mem_cgroup_iter_break(NULL, memcg);
> -                       return;
> -               }
> +               mem_cgroup_calculate_protection(NULL, memcg);
>
> -               cond_resched();
> +               if (!reclaimable)
> +                       reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
>         } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
>
>         /*
> @@ -4625,7 +4642,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>          * younger than min_ttl. However, another possibility is all memcgs are
>          * either too small or below min.
>          */
> -       if (mutex_trylock(&oom_lock)) {
> +       if (!reclaimable && mutex_trylock(&oom_lock)) {
>                 struct oom_control oc = {
>                         .gfp_mask = sc->gfp_mask,
>                 };
> @@ -5425,8 +5442,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>
> -       mem_cgroup_calculate_protection(NULL, memcg);
> -
> +       /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
>         if (mem_cgroup_below_min(NULL, memcg))
>                 return MEMCG_LRU_YOUNG;
>
> @@ -5566,29 +5582,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
>
>  #endif
>
> -static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
> -{
> -       int priority;
> -       unsigned long reclaimable;
> -       struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
> -
> -       if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
> -               return;
> -       /*
> -        * Determine the initial priority based on
> -        * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
> -        * where reclaimed_to_scanned_ratio = inactive / total.
> -        */
> -       reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
> -       if (get_swappiness(lruvec, sc))
> -               reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
> -
> -       /* round down reclaimable and round up sc->nr_to_reclaim */
> -       priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
> -
> -       sc->priority = clamp(priority, 0, DEF_PRIORITY);
> -}
> -
>  static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
>  {
>         struct blk_plug plug;
> --
> 2.46.0.rc1.232.g9752f9e123-goog
>

Please ignore this patch. I didn't see Yu's existing thread for this.





[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux