Re: [PATCH v4 1/1] mm: vmscan: retry folios written back while isolated for traditional LRU

On Wed, Dec 11, 2024 at 1:11 AM chenridong <chenridong@xxxxxxxxxx> wrote:
>
>
>
> On 2024/12/10 16:24, Barry Song wrote:
> > On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@xxxxxxxxxx> wrote:
> >>
> >>
> >>
> >> On 2024/12/10 12:54, Barry Song wrote:
> >>> On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@xxxxxxxxxxxxxxx> wrote:
> >>>>
> >>>> From: Chen Ridong <chenridong@xxxxxxxxxx>
> >>>>
> >>>> Commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
> >>>> while isolated") only fixed the issue for mglru. However, the same
> >>>> issue also exists in the traditional active/inactive LRU. It gets
> >>>> worse when a THP is split, which makes the list longer, so reclaiming
> >>>> a batch of folios takes longer to finish.
> >>>>
> >>>> Fix the issue in the same way for the traditional LRU: first extract
> >>>> the common logic into a new function, 'find_folios_written_back',
> >>>> then reuse it in 'shrink_inactive_list', and finally retry reclaiming
> >>>> those folios that may have missed the rotation.
> >>>
> >>> let's drop the cover-letter and refine the changelog.
> >>>
> >> Will update.
> >>
> >>>>
> >>>> Signed-off-by: Chen Ridong <chenridong@xxxxxxxxxx>
> >>>> ---
> >>>>  include/linux/mmzone.h |   3 +-
> >>>>  mm/vmscan.c            | 108 +++++++++++++++++++++++++++++------------
> >>>>  2 files changed, 77 insertions(+), 34 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >>>> index b36124145a16..47c6e8c43dcd 100644
> >>>> --- a/include/linux/mmzone.h
> >>>> +++ b/include/linux/mmzone.h
> >>>> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk;
> >>>>
> >>>>  #define LRU_GEN_MASK           ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
> >>>>  #define LRU_REFS_MASK          ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
> >>>> +#define LRU_REFS_FLAGS         (BIT(PG_referenced) | BIT(PG_workingset))
> >>>>
> >>>>  #ifdef CONFIG_LRU_GEN
> >>>>
> >>>> @@ -406,8 +407,6 @@ enum {
> >>>>         NR_LRU_GEN_CAPS
> >>>>  };
> >>>>
> >>>> -#define LRU_REFS_FLAGS         (BIT(PG_referenced) | BIT(PG_workingset))
> >>>> -
> >>>>  #define MIN_LRU_BATCH          BITS_PER_LONG
> >>>>  #define MAX_LRU_BATCH          (MIN_LRU_BATCH * 64)
> >>>>
> >>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >>>> index 76378bc257e3..1f0d194f8b2f 100644
> >>>> --- a/mm/vmscan.c
> >>>> +++ b/mm/vmscan.c
> >>>> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task,
> >>>>         task->reclaim_state = rs;
> >>>>  }
> >>>>
> >>>> +/**
> >>>> + * find_folios_written_back - Find and move the written-back folios to a new list.
> >>>> + * @list: the folios list to scan
> >>>> + * @clean: the list to collect the written-back folios
> >>>> + * @skip: whether to skip moving the written-back folios to @clean
> >>>> + */
> >>>> +static inline void find_folios_written_back(struct list_head *list,
> >>>> +               struct list_head *clean, bool skip)
> >>>> +{
> >>>> +       struct folio *folio;
> >>>> +       struct folio *next;
> >>>> +
> >>>> +       list_for_each_entry_safe_reverse(folio, next, list, lru) {
> >>>> +               if (!folio_evictable(folio)) {
> >>>> +                       list_del(&folio->lru);
> >>>> +                       folio_putback_lru(folio);
> >>>> +                       continue;
> >>>> +               }
> >>>> +
> >>>> +               if (folio_test_reclaim(folio) &&
> >>>> +                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
> >>>> +                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> >>>> +                       if (lru_gen_enabled() && folio_test_workingset(folio))
> >>>> +                               folio_set_referenced(folio);
> >>>> +                       continue;
> >>>> +               }
> >>>> +
> >>>> +               if (skip || folio_test_active(folio) || folio_test_referenced(folio) ||
> >>>> +                   folio_mapped(folio) || folio_test_locked(folio) ||
> >>>> +                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
> >>>> +                       /* don't add rejected folios to the oldest generation */
> >>>> +                       if (lru_gen_enabled())
> >>>> +                               set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
> >>>> +                                             BIT(PG_active));
> >>>> +                       continue;
> >>>> +               }
> >>>> +
> >>>> +               /* retry folios that may have missed folio_rotate_reclaimable() */
> >>>> +               list_move(&folio->lru, clean);
> >>>> +       }
> >>>> +}
> >>>> +
> >>>>  /*
> >>>>   * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> >>>>   * scan_control->nr_reclaimed.
> >>>> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void)
> >>>>         return !(current->flags & PF_LOCAL_THROTTLE);
> >>>>  }
> >>>>
> >>>> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
> >>>> +               struct reclaim_stat *curr)
> >>>> +{
> >>>> +       int i;
> >>>> +
> >>>> +       stat->nr_dirty += curr->nr_dirty;
> >>>> +       stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
> >>>> +       stat->nr_congested += curr->nr_congested;
> >>>> +       stat->nr_writeback += curr->nr_writeback;
> >>>> +       stat->nr_immediate += curr->nr_immediate;
> >>>> +       stat->nr_pageout += curr->nr_pageout;
> >>>> +       stat->nr_ref_keep += curr->nr_ref_keep;
> >>>> +       stat->nr_unmap_fail += curr->nr_unmap_fail;
> >>>> +       stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
> >>>> +       stat->nr_demoted += curr->nr_demoted;
> >>>> +       for (i = 0; i < ANON_AND_FILE; i++)
> >>>> +               stat->nr_activate[i] = curr->nr_activate[i];
> >>>> +}
> >>>
> >>> You didn't have this before; what's the purpose of it?
> >>>
> >>
> >> We may call shrink_folio_list twice, and 'curr' is reset inside
> >> shrink_folio_list each time. We should accumulate the stats as a
> >> whole, which are then used to calculate the cost and returned to
> >> the caller.
> >
> > Does mglru have the same issue? If so, we may need to send a patch to
> > fix mglru's stat accounting as well. By the way, the code is rather
> > messy—could it be implemented as shown below instead?
> >
>
> I have checked the code (in the evict_folios function) again, and it
> appears that 'reclaimed' feeds into sc->nr_reclaimed, which would
> accumulate the results twice. Should I address this issue with a
> separate patch?

I don't think there is any problem.

        reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
        sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
        sc->nr_reclaimed += reclaimed;

'reclaimed' is always the number of pages reclaimed by that particular
shrink_folio_list() call, regardless of whether it is a retry or not.
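
For example, with made-up numbers:

	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
	sc->nr_reclaimed += reclaimed;	/* say 50 reclaimed: total 50 */

	/* the retry pass only sees the folios spliced back from @clean */
	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
	sc->nr_reclaimed += reclaimed;	/* say 20 more: total 70 */

Each call only counts the pages it freed itself, so summing the two
passes cannot double count.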

>
> if (!cgroup_reclaim(sc))
>         __count_vm_events(item, reclaimed);
> __count_memcg_events(memcg, item, reclaimed);
> __count_vm_events(PGSTEAL_ANON + type, reclaimed);
>
>
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 1f0d194f8b2f..40d2ddde21f5 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> >   struct swap_iocb *plug = NULL;
> >
> >   folio_batch_init(&free_folios);
> > - memset(stat, 0, sizeof(*stat));
> >   cond_resched();
> >   do_demote_pass = can_demote(pgdat->node_id, sc);
> >
> > @@ -1949,25 +1948,6 @@ static int current_may_throttle(void)
> >   return !(current->flags & PF_LOCAL_THROTTLE);
> >  }
> >
> > -static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
> > - struct reclaim_stat *curr)
> > -{
> > - int i;
> > -
> > - stat->nr_dirty += curr->nr_dirty;
> > - stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
> > - stat->nr_congested += curr->nr_congested;
> > - stat->nr_writeback += curr->nr_writeback;
> > - stat->nr_immediate += curr->nr_immediate;
> > - stat->nr_pageout += curr->nr_pageout;
> > - stat->nr_ref_keep += curr->nr_ref_keep;
> > - stat->nr_unmap_fail += curr->nr_unmap_fail;
> > - stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
> > - stat->nr_demoted += curr->nr_demoted;
> > - for (i = 0; i < ANON_AND_FILE; i++)
> > - stat->nr_activate[i] = curr->nr_activate[i];
> > -}
> > -
> >  /*
> >   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
> >   * of reclaimed pages
> > @@ -1981,7 +1961,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >   unsigned long nr_scanned;
> >   unsigned int nr_reclaimed = 0;
> >   unsigned long nr_taken;
> > - struct reclaim_stat stat, curr;
> > + struct reclaim_stat stat;
> >   bool file = is_file_lru(lru);
> >   enum vm_event_item item;
> >   struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> > @@ -2022,9 +2002,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >
> >   memset(&stat, 0, sizeof(stat));
> >  retry:
> > - nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
> > + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
> >   find_folios_written_back(&folio_list, &clean_list, skip_retry);
> > - acc_reclaimed_stat(&stat, &curr);
> >
> >   spin_lock_irq(&lruvec->lru_lock);
> >   move_folios_to_lru(lruvec, &folio_list);
> >
>
> This seems much better. But we have extra work to do:
>
> 1. In the shrink_folio_list function:
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1089,12 +1089,12 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>         LIST_HEAD(ret_folios);
>         LIST_HEAD(demote_folios);
>         unsigned int nr_reclaimed = 0;
> -       unsigned int pgactivate = 0;
> +       unsigned int pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
> +       unsigned int nr_demote = 0;
>         bool do_demote_pass;
>         struct swap_iocb *plug = NULL;
>
>         folio_batch_init(&free_folios);
> -       memset(stat, 0, sizeof(*stat));
>         cond_resched();
>         do_demote_pass = can_demote(pgdat->node_id, sc);
>
> @@ -1558,7 +1558,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>
>         /* Migrate folios selected for demotion */
>         stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
> -       nr_reclaimed += stat->nr_demoted;
> +       stat->nr_demoted += nr_demote;
> +       nr_reclaimed += nr_demote;
>         /* Folios that could not be demoted are still in @demote_folios */
>         if (!list_empty(&demote_folios)) {
>                 /* Folios which weren't demoted go back on @folio_list */
> @@ -1586,7 +1587,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>                 }
>         }
>
> -       pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
> +       pgactivate = stat->nr_activate[0] + stat->nr_activate[1] -
> pgactivate;
>
>         mem_cgroup_uncharge_folios(&free_folios);
>         try_to_unmap_flush();
>
>
> 2. Outside of the shrink_folio_list function, the callers should memset
> the stat.
>
> If you think this is better, I will update it like this.

No. Please goto retry after you have collected all you need from
`stat`, just as mglru is doing, and drop the "curr" and
acc_reclaimed_stat().

        sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
        sc->nr_reclaimed += reclaimed;

move_folios_to_lru() has already moved all the folios we are not
interested in back to the lruvec before you retry, so there is no
duplicated counting.
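
Roughly like this (untested sketch, reusing the names from your patch):

	memset(&stat, 0, sizeof(stat));
retry:
	nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
	/* e.g. harvest what you need from @stat before the next pass resets it */
	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
	find_folios_written_back(&folio_list, &clean_list, skip_retry);

	spin_lock_irq(&lruvec->lru_lock);
	move_folios_to_lru(lruvec, &folio_list);
	if (!list_empty(&clean_list)) {
		list_splice_init(&clean_list, &folio_list);
		skip_retry = true;
		spin_unlock_irq(&lruvec->lru_lock);
		goto retry;
	}
	/* the rest of the function stays as it is */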

>
> >>
> >> Thanks,
> >> Ridong
> >>
> >>>> +
> >>>>  /*
> >>>>   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
> >>>>   * of reclaimed pages
> >>>> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >>>>                 enum lru_list lru)
> >>>>  {
> >>>>         LIST_HEAD(folio_list);
> >>>> +       LIST_HEAD(clean_list);
> >>>>         unsigned long nr_scanned;
> >>>>         unsigned int nr_reclaimed = 0;
> >>>>         unsigned long nr_taken;
> >>>> -       struct reclaim_stat stat;
> >>>> +       struct reclaim_stat stat, curr;
> >>>>         bool file = is_file_lru(lru);
> >>>>         enum vm_event_item item;
> >>>>         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> >>>>         bool stalled = false;
> >>>> +       bool skip_retry = false;
> >>>>
> >>>>         while (unlikely(too_many_isolated(pgdat, file, sc))) {
> >>>>                 if (stalled)
> >>>> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >>>>         if (nr_taken == 0)
> >>>>                 return 0;
> >>>>
> >>>> -       nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
> >>>> +       memset(&stat, 0, sizeof(stat));
> >>>> +retry:
> >>>> +       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
> >>>> +       find_folios_written_back(&folio_list, &clean_list, skip_retry);
> >>>> +       acc_reclaimed_stat(&stat, &curr);
> >>>>
> >>>>         spin_lock_irq(&lruvec->lru_lock);
> >>>>         move_folios_to_lru(lruvec, &folio_list);
> >>>> +       if (!list_empty(&clean_list)) {
> >>>> +               list_splice_init(&clean_list, &folio_list);
> >>>> +               skip_retry = true;
> >>>> +               spin_unlock_irq(&lruvec->lru_lock);
> >>>> +               goto retry;
> >
> > This is rather confusing. We're still jumping to retry even though
> > skip_retry=true is set. Can we find a clearer approach for this?
> >
> > It was somewhat acceptable before we introduced the extracted
> > function find_folios_written_back(). However, it has become
> > harder to follow now that skip_retry is passed across functions.
> >
> > I find renaming skip_retry to is_retry more intuitive. The logic
> > is that since we are already retrying, find_folios_written_back()
> > shouldn’t move folios to the clean list again. The intended semantics
> > are: we have retried, don’t retry again.
> >
>
> Reasonable. Will update.
>
> Thanks,
> Ridong
>
> >
> >>>> +       }
> >>>>
> >>>>         __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
> >>>>                                         stat.nr_demoted);
> >>>> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
> >>>>         int reclaimed;
> >>>>         LIST_HEAD(list);
> >>>>         LIST_HEAD(clean);
> >>>> -       struct folio *folio;
> >>>> -       struct folio *next;
> >>>>         enum vm_event_item item;
> >>>>         struct reclaim_stat stat;
> >>>>         struct lru_gen_mm_walk *walk;
> >>>> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
> >>>>                         scanned, reclaimed, &stat, sc->priority,
> >>>>                         type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
> >>>>
> >>>> -       list_for_each_entry_safe_reverse(folio, next, &list, lru) {
> >>>> -               if (!folio_evictable(folio)) {
> >>>> -                       list_del(&folio->lru);
> >>>> -                       folio_putback_lru(folio);
> >>>> -                       continue;
> >>>> -               }
> >>>> -
> >>>> -               if (folio_test_reclaim(folio) &&
> >>>> -                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
> >>>> -                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> >>>> -                       if (folio_test_workingset(folio))
> >>>> -                               folio_set_referenced(folio);
> >>>> -                       continue;
> >>>> -               }
> >>>> -
> >>>> -               if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
> >>>> -                   folio_mapped(folio) || folio_test_locked(folio) ||
> >>>> -                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
> >>>> -                       /* don't add rejected folios to the oldest generation */
> >>>> -                       set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
> >>>> -                                     BIT(PG_active));
> >>>> -                       continue;
> >>>> -               }
> >>>> -
> >>>> -               /* retry folios that may have missed folio_rotate_reclaimable() */
> >>>> -               list_move(&folio->lru, &clean);
> >>>> -       }
> >>>> -
> >>>> +       find_folios_written_back(&list, &clean, skip_retry);
> >>>>         spin_lock_irq(&lruvec->lru_lock);
> >>>>
> >>>>         move_folios_to_lru(lruvec, &list);
> >>>> --
> >>>> 2.34.1
> >>>>
> >>>
> >
> > Thanks
> >  Barry




