On Wed, Dec 11, 2024 at 1:11 AM chenridong <chenridong@xxxxxxxxxx> wrote:
>
>
> On 2024/12/10 16:24, Barry Song wrote:
> > On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@xxxxxxxxxx> wrote:
> >>
> >>
> >> On 2024/12/10 12:54, Barry Song wrote:
> >>> On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@xxxxxxxxxxxxxxx> wrote:
> >>>>
> >>>> From: Chen Ridong <chenridong@xxxxxxxxxx>
> >>>>
> >>>> Commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
> >>>> while isolated") only fixed the issue for mglru. However, the same
> >>>> issue also exists in the traditional active/inactive LRU, and it gets
> >>>> worse when a THP is split, which makes the list longer so that a
> >>>> batch of folio reclaim takes longer to finish.
> >>>>
> >>>> Fix it in the same way for the traditional LRU: first extract the
> >>>> common logic into a new function, 'find_folios_written_back', then
> >>>> reuse it in 'shrink_inactive_list', and finally retry reclaiming
> >>>> those folios that may have missed the rotation for the traditional
> >>>> LRU.
> >>>
> >>> let's drop the cover-letter and refine the changelog.
> >>>
> >> Will update.
> >>
> >>>>
> >>>> Signed-off-by: Chen Ridong <chenridong@xxxxxxxxxx>
> >>>> ---
> >>>>  include/linux/mmzone.h |   3 +-
> >>>>  mm/vmscan.c            | 108 +++++++++++++++++++++++++++++------------
> >>>>  2 files changed, 77 insertions(+), 34 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >>>> index b36124145a16..47c6e8c43dcd 100644
> >>>> --- a/include/linux/mmzone.h
> >>>> +++ b/include/linux/mmzone.h
> >>>> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk;
> >>>>
> >>>>  #define LRU_GEN_MASK   ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
> >>>>  #define LRU_REFS_MASK  ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
> >>>> +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
> >>>>
> >>>>  #ifdef CONFIG_LRU_GEN
> >>>>
> >>>> @@ -406,8 +407,6 @@ enum {
> >>>>         NR_LRU_GEN_CAPS
> >>>>  };
> >>>>
> >>>> -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
> >>>> -
> >>>>  #define MIN_LRU_BATCH  BITS_PER_LONG
> >>>>  #define MAX_LRU_BATCH  (MIN_LRU_BATCH * 64)
> >>>>
> >>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >>>> index 76378bc257e3..1f0d194f8b2f 100644
> >>>> --- a/mm/vmscan.c
> >>>> +++ b/mm/vmscan.c
> >>>> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task,
> >>>>         task->reclaim_state = rs;
> >>>>  }
> >>>>
> >>>> +/**
> >>>> + * find_folios_written_back - Find written-back folios and move them to a new list.
> >>>> + * @list: the list of folios to scan
> >>>> + * @clean: the list that receives the written-back folios
> >>>> + * @skip: whether to skip moving the written-back folios to @clean
> >>>> + */
> >>>> +static inline void find_folios_written_back(struct list_head *list,
> >>>> +               struct list_head *clean, bool skip)
> >>>> +{
> >>>> +       struct folio *folio;
> >>>> +       struct folio *next;
> >>>> +
> >>>> +       list_for_each_entry_safe_reverse(folio, next, list, lru) {
> >>>> +               if (!folio_evictable(folio)) {
> >>>> +                       list_del(&folio->lru);
> >>>> +                       folio_putback_lru(folio);
> >>>> +                       continue;
> >>>> +               }
> >>>> +
> >>>> +               if (folio_test_reclaim(folio) &&
> >>>> +                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
> >>>> +                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> >>>> +                       if (lru_gen_enabled() && folio_test_workingset(folio))
> >>>> +                               folio_set_referenced(folio);
> >>>> +                       continue;
> >>>> +               }
> >>>> +
> >>>> +               if (skip || folio_test_active(folio) || folio_test_referenced(folio) ||
> >>>> +                   folio_mapped(folio) || folio_test_locked(folio) ||
> >>>> +                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
> >>>> +                       /* don't add rejected folios to the oldest generation */
> >>>> +                       if (lru_gen_enabled())
> >>>> +                               set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
> >>>> +                                             BIT(PG_active));
> >>>> +                       continue;
> >>>> +               }
> >>>> +
> >>>> +               /* retry folios that may have missed folio_rotate_reclaimable() */
> >>>> +               list_move(&folio->lru, clean);
> >>>> +       }
> >>>> +}
> >>>> +
> >>>>  /*
> >>>>   * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> >>>>   * scan_control->nr_reclaimed.
> >>>> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void)
> >>>>         return !(current->flags & PF_LOCAL_THROTTLE);
> >>>>  }
> >>>>
> >>>> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
> >>>> +                                     struct reclaim_stat *curr)
> >>>> +{
> >>>> +       int i;
> >>>> +
> >>>> +       stat->nr_dirty += curr->nr_dirty;
> >>>> +       stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
> >>>> +       stat->nr_congested += curr->nr_congested;
> >>>> +       stat->nr_writeback += curr->nr_writeback;
> >>>> +       stat->nr_immediate += curr->nr_immediate;
> >>>> +       stat->nr_pageout += curr->nr_pageout;
> >>>> +       stat->nr_ref_keep += curr->nr_ref_keep;
> >>>> +       stat->nr_unmap_fail += curr->nr_unmap_fail;
> >>>> +       stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
> >>>> +       stat->nr_demoted += curr->nr_demoted;
> >>>> +       for (i = 0; i < ANON_AND_FILE; i++)
> >>>> +               stat->nr_activate[i] = curr->nr_activate[i];
> >>>> +}
> >>>
> >>> you didn't have this before; what's the purpose of it?
> >>>
> >>
> >> We may call shrink_folio_list twice, and the stat 'curr' is reset
> >> inside shrink_folio_list. We should accumulate the stats as a whole,
> >> which will then be used to calculate the cost and be returned to the
> >> caller.
> >
> > Does mglru have the same issue? If so, we may need to send a patch to
> > fix mglru's stat accounting as well. By the way, the code is rather
> > messy; could it be implemented as shown below instead?
> >
>
> I have checked the code (in the evict_folios function) again, and it
> appears that 'reclaimed' should correspond to sc->nr_reclaimed, which
> accumulates the results twice. Should I address this issue with a
> separate patch?

I don't think there is any problem.

        reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
        sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
        sc->nr_reclaimed += reclaimed;

reclaimed is always the number of pages we have reclaimed in
shrink_folio_list(), whether or not the pass is a retry.
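To make the accounting argument concrete, here is a tiny userspace model
of that pattern (all names are stand-ins for illustration, not the real
mm/vmscan.c interfaces; it assumes the memset of *stat has been moved
out of shrink_folio_list() and into the caller, as in the diff below):

        #include <stdio.h>

        struct reclaim_stat { unsigned int nr_unqueued_dirty; };

        /* models one shrink_folio_list() pass: bumps the caller's stat
         * and returns how many pages this pass reclaimed */
        static unsigned int one_pass(struct reclaim_stat *stat,
                                     unsigned int reclaimed, unsigned int dirty)
        {
                stat->nr_unqueued_dirty += dirty;  /* increments, never resets */
                return reclaimed;
        }

        int main(void)
        {
                struct reclaim_stat stat = { 0 };  /* the caller zeroes once */
                unsigned int nr_reclaimed = 0;

                nr_reclaimed += one_pass(&stat, 10, 2); /* first pass */
                nr_reclaimed += one_pass(&stat, 3, 1);  /* retry pass */

                printf("reclaimed=%u unqueued_dirty=%u\n",
                       nr_reclaimed, stat.nr_unqueued_dirty);
                return 0;
        }

This prints reclaimed=13 unqueued_dirty=3: both counters come out right
across the retry with no "curr" copy and no acc_reclaimed_stat().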
>
>        if (!cgroup_reclaim(sc))
>                __count_vm_events(item, reclaimed);
>        __count_memcg_events(memcg, item, reclaimed);
>        __count_vm_events(PGSTEAL_ANON + type, reclaimed);
>

> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 1f0d194f8b2f..40d2ddde21f5 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> >         struct swap_iocb *plug = NULL;
> >
> >         folio_batch_init(&free_folios);
> > -       memset(stat, 0, sizeof(*stat));
> >         cond_resched();
> >         do_demote_pass = can_demote(pgdat->node_id, sc);
> >
> > @@ -1949,25 +1948,6 @@ static int current_may_throttle(void)
> >         return !(current->flags & PF_LOCAL_THROTTLE);
> >  }
> >
> > -static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
> > -                                     struct reclaim_stat *curr)
> > -{
> > -       int i;
> > -
> > -       stat->nr_dirty += curr->nr_dirty;
> > -       stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
> > -       stat->nr_congested += curr->nr_congested;
> > -       stat->nr_writeback += curr->nr_writeback;
> > -       stat->nr_immediate += curr->nr_immediate;
> > -       stat->nr_pageout += curr->nr_pageout;
> > -       stat->nr_ref_keep += curr->nr_ref_keep;
> > -       stat->nr_unmap_fail += curr->nr_unmap_fail;
> > -       stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
> > -       stat->nr_demoted += curr->nr_demoted;
> > -       for (i = 0; i < ANON_AND_FILE; i++)
> > -               stat->nr_activate[i] = curr->nr_activate[i];
> > -}
> > -
> >  /*
> >   * shrink_inactive_list() is a helper for shrink_node(). It returns the number
> >   * of reclaimed pages
> > @@ -1981,7 +1961,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >         unsigned long nr_scanned;
> >         unsigned int nr_reclaimed = 0;
> >         unsigned long nr_taken;
> > -       struct reclaim_stat stat, curr;
> > +       struct reclaim_stat stat;
> >         bool file = is_file_lru(lru);
> >         enum vm_event_item item;
> >         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> > @@ -2022,9 +2002,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >
> >         memset(&stat, 0, sizeof(stat));
> >  retry:
> > -       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
> > +       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
> >         find_folios_written_back(&folio_list, &clean_list, skip_retry);
> > -       acc_reclaimed_stat(&stat, &curr);
> >
> >         spin_lock_irq(&lruvec->lru_lock);
> >         move_folios_to_lru(lruvec, &folio_list);
> >
>
> This seems much better, but we have extra work to do:
>
> 1. In the shrink_folio_list function:
>
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1089,12 +1089,12 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>         LIST_HEAD(ret_folios);
>         LIST_HEAD(demote_folios);
>         unsigned int nr_reclaimed = 0;
> -       unsigned int pgactivate = 0;
> +       unsigned int pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
> +       unsigned int nr_demote = 0;
>         bool do_demote_pass;
>         struct swap_iocb *plug = NULL;
>
>         folio_batch_init(&free_folios);
> -       memset(stat, 0, sizeof(*stat));
>         cond_resched();
>         do_demote_pass = can_demote(pgdat->node_id, sc);
>
> @@ -1558,7 +1558,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>
>         /* Migrate folios selected for demotion */
>         stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
> -       nr_reclaimed += stat->nr_demoted;
> +       stat->nr_demoted += nr_demote;
> +       nr_reclaimed += nr_demote;
>         /* Folios that could not be demoted are still in @demote_folios */
>         if (!list_empty(&demote_folios)) {
>                 /* Folios which weren't demoted go back on @folio_list */
> @@ -1586,7 +1587,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>                 }
>         }
>
> -       pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
> +       pgactivate = stat->nr_activate[0] + stat->nr_activate[1] - pgactivate;
>
>         mem_cgroup_uncharge_folios(&free_folios);
>         try_to_unmap_flush();
>
> 2. Outside of the shrink_folio_list function, the callers should memset
> the stat.
>
> If you think this will be better, I will update like this.

No. Please goto retry after you have collected all you need from
`stat`, just as mglru does, and drop the "curr" and acc_reclaimed_stat():

        sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
        sc->nr_reclaimed += reclaimed;

move_folios_to_lru() has already moved all the folios we are not
interested in back to the lruvec before you retry, so there is no
duplicated counting.
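Assembled, the flow being asked for in shrink_inactive_list() looks
roughly like this (a sketch pieced together from the diffs above, not a
tested patch; it assumes shrink_folio_list() no longer zeroes *stat):

        memset(&stat, 0, sizeof(stat));         /* zeroed once by the caller */
retry:
        /* stat accumulates across passes now that shrink_folio_list()
         * no longer resets it, so no "curr" copy is needed */
        nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
        find_folios_written_back(&folio_list, &clean_list, skip_retry);

        spin_lock_irq(&lruvec->lru_lock);
        /* every folio we are done with goes back to the lruvec here, so
         * the retry pass only ever sees the folios on clean_list */
        move_folios_to_lru(lruvec, &folio_list);
        if (!list_empty(&clean_list)) {
                list_splice_init(&clean_list, &folio_list);
                skip_retry = true;
                spin_unlock_irq(&lruvec->lru_lock);
                goto retry;
        }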
>
> >>
> >> Thanks,
> >> Ridong
> >>
> >>>> +
> >>>>  /*
> >>>>   * shrink_inactive_list() is a helper for shrink_node(). It returns the number
> >>>>   * of reclaimed pages
> >>>> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >>>>                 enum lru_list lru)
> >>>>  {
> >>>>         LIST_HEAD(folio_list);
> >>>> +       LIST_HEAD(clean_list);
> >>>>         unsigned long nr_scanned;
> >>>>         unsigned int nr_reclaimed = 0;
> >>>>         unsigned long nr_taken;
> >>>> -       struct reclaim_stat stat;
> >>>> +       struct reclaim_stat stat, curr;
> >>>>         bool file = is_file_lru(lru);
> >>>>         enum vm_event_item item;
> >>>>         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> >>>>         bool stalled = false;
> >>>> +       bool skip_retry = false;
> >>>>
> >>>>         while (unlikely(too_many_isolated(pgdat, file, sc))) {
> >>>>                 if (stalled)
> >>>> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >>>>         if (nr_taken == 0)
> >>>>                 return 0;
> >>>>
> >>>> -       nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
> >>>> +       memset(&stat, 0, sizeof(stat));
> >>>> +retry:
> >>>> +       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
> >>>> +       find_folios_written_back(&folio_list, &clean_list, skip_retry);
> >>>> +       acc_reclaimed_stat(&stat, &curr);
> >>>>
> >>>>         spin_lock_irq(&lruvec->lru_lock);
> >>>>         move_folios_to_lru(lruvec, &folio_list);
> >>>> +       if (!list_empty(&clean_list)) {
> >>>> +               list_splice_init(&clean_list, &folio_list);
> >>>> +               skip_retry = true;
> >>>> +               spin_unlock_irq(&lruvec->lru_lock);
> >>>> +               goto retry;
> >
> > This is rather confusing. We're still jumping to retry even though
> > skip_retry=true is set. Can we find a clearer approach for this?
> >
> > It was somewhat acceptable before we introduced the extracted
> > function find_folios_written_back(). However, it has become
> > harder to follow now that skip_retry is passed across functions.
> >
> > I find renaming skip_retry to is_retry more intuitive. The logic
> > is that since we are already retrying, find_folios_written_back()
> > shouldn't move folios to the clean list again. The intended
> > semantics are: we have retried once, don't retry again.
> >
>
> Reasonable. Will update.
>
> Thanks,
> Ridong
>
> >
> >>>> +       }
> >>>>
> >>>>         __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
> >>>>                            stat.nr_demoted);
> >>>> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
> >>>>         int reclaimed;
> >>>>         LIST_HEAD(list);
> >>>>         LIST_HEAD(clean);
> >>>> -       struct folio *folio;
> >>>> -       struct folio *next;
> >>>>         enum vm_event_item item;
> >>>>         struct reclaim_stat stat;
> >>>>         struct lru_gen_mm_walk *walk;
> >>>> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
> >>>>                        scanned, reclaimed, &stat, sc->priority,
> >>>>                        type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
> >>>>
> >>>> -       list_for_each_entry_safe_reverse(folio, next, &list, lru) {
> >>>> -               if (!folio_evictable(folio)) {
> >>>> -                       list_del(&folio->lru);
> >>>> -                       folio_putback_lru(folio);
> >>>> -                       continue;
> >>>> -               }
> >>>> -
> >>>> -               if (folio_test_reclaim(folio) &&
> >>>> -                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
> >>>> -                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> >>>> -                       if (folio_test_workingset(folio))
> >>>> -                               folio_set_referenced(folio);
> >>>> -                       continue;
> >>>> -               }
> >>>> -
> >>>> -               if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
> >>>> -                   folio_mapped(folio) || folio_test_locked(folio) ||
> >>>> -                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
> >>>> -                       /* don't add rejected folios to the oldest generation */
> >>>> -                       set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
> >>>> -                                     BIT(PG_active));
> >>>> -                       continue;
> >>>> -               }
> >>>> -
> >>>> -               /* retry folios that may have missed folio_rotate_reclaimable() */
> >>>> -               list_move(&folio->lru, &clean);
> >>>> -       }
> >>>> -
> >>>> +       find_folios_written_back(&list, &clean, skip_retry);
> >>>>         spin_lock_irq(&lruvec->lru_lock);
> >>>>
> >>>>         move_folios_to_lru(lruvec, &list);
> >>>> --
> >>>> 2.34.1
> >>>>
> >>>
> >

Thanks
Barry
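For reference, the is_retry semantics agreed above reduce to the
following at the call sites (a sketch of the intent only, using the
names from the discussion rather than the final patch):

        /* first pass: divert folios that may have missed
         * folio_rotate_reclaimable() onto @clean for one retry */
        find_folios_written_back(&list, &clean, /* is_retry = */ false);

        /* retry pass: we are already retrying, so nothing is moved to
         * @clean again and a third pass can never be triggered */
        find_folios_written_back(&list, &clean, /* is_retry = */ true);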