On 2024/12/10 16:24, Barry Song wrote:
> On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@xxxxxxxxxx> wrote:
>>
>>
>>
>> On 2024/12/10 12:54, Barry Song wrote:
>>> On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@xxxxxxxxxxxxxxx> wrote:
>>>>
>>>> From: Chen Ridong <chenridong@xxxxxxxxxx>
>>>>
>>>> The commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
>>>> while isolated") only fixed the issue for mglru. However, this issue
>>>> also exists in the traditional active/inactive LRU. This issue will be
>>>> worse if THP is split, which makes the list longer and needs longer time
>>>> to finish a batch of folios reclaim.
>>>>
>>>> This issue should be fixed in the same way for the traditional LRU.
>>>> Therefore, the common logic was extracted to the 'find_folios_written_back'
>>>> function firstly, which is then reused in the 'shrink_inactive_list'
>>>> function. Finally, retry reclaiming those folios that may have missed the
>>>> rotation for traditional LRU.
>>>
>>> let's drop the cover-letter and refine the changelog.
>>>
>> Will update.
>>
>>>>
>>>> Signed-off-by: Chen Ridong <chenridong@xxxxxxxxxx>
>>>> ---
>>>>  include/linux/mmzone.h |   3 +-
>>>>  mm/vmscan.c            | 108 +++++++++++++++++++++++++++++------------
>>>>  2 files changed, 77 insertions(+), 34 deletions(-)
>>>>
>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>> index b36124145a16..47c6e8c43dcd 100644
>>>> --- a/include/linux/mmzone.h
>>>> +++ b/include/linux/mmzone.h
>>>> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk;
>>>>
>>>>  #define LRU_GEN_MASK   ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
>>>>  #define LRU_REFS_MASK  ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
>>>> +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
>>>>
>>>>  #ifdef CONFIG_LRU_GEN
>>>>
>>>> @@ -406,8 +407,6 @@ enum {
>>>>         NR_LRU_GEN_CAPS
>>>>  };
>>>>
>>>> -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
>>>> -
>>>>  #define MIN_LRU_BATCH  BITS_PER_LONG
>>>>  #define MAX_LRU_BATCH  (MIN_LRU_BATCH * 64)
>>>>
>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>> index 76378bc257e3..1f0d194f8b2f 100644
>>>> --- a/mm/vmscan.c
>>>> +++ b/mm/vmscan.c
>>>> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>         task->reclaim_state = rs;
>>>>  }
>>>>
>>>> +/**
>>>> + * find_folios_written_back - Find and move the written back folios to a new list.
>>>> + * @list: folios list
>>>> + * @clean: the written back folios list
>>>> + * @skip: whether to skip moving the written back folios to the clean list.
>>>> + */
>>>> +static inline void find_folios_written_back(struct list_head *list,
>>>> +               struct list_head *clean, bool skip)
>>>> +{
>>>> +       struct folio *folio;
>>>> +       struct folio *next;
>>>> +
>>>> +       list_for_each_entry_safe_reverse(folio, next, list, lru) {
>>>> +               if (!folio_evictable(folio)) {
>>>> +                       list_del(&folio->lru);
>>>> +                       folio_putback_lru(folio);
>>>> +                       continue;
>>>> +               }
>>>> +
>>>> +               if (folio_test_reclaim(folio) &&
>>>> +                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
>>>> +                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
>>>> +                       if (lru_gen_enabled() && folio_test_workingset(folio))
>>>> +                               folio_set_referenced(folio);
>>>> +                       continue;
>>>> +               }
>>>> +
>>>> +               if (skip || folio_test_active(folio) || folio_test_referenced(folio) ||
>>>> +                   folio_mapped(folio) || folio_test_locked(folio) ||
>>>> +                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
>>>> +                       /* don't add rejected folios to the oldest generation */
>>>> +                       if (lru_gen_enabled())
>>>> +                               set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
>>>> +                                             BIT(PG_active));
>>>> +                       continue;
>>>> +               }
>>>> +
>>>> +               /* retry folios that may have missed folio_rotate_reclaimable() */
>>>> +               list_move(&folio->lru, clean);
>>>> +       }
>>>> +}
>>>> +
>>>>  /*
>>>>   * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
>>>>   * scan_control->nr_reclaimed.
>>>> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void)
>>>>         return !(current->flags & PF_LOCAL_THROTTLE);
>>>>  }
>>>>
>>>> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
>>>> +                                     struct reclaim_stat *curr)
>>>> +{
>>>> +       int i;
>>>> +
>>>> +       stat->nr_dirty += curr->nr_dirty;
>>>> +       stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
>>>> +       stat->nr_congested += curr->nr_congested;
>>>> +       stat->nr_writeback += curr->nr_writeback;
>>>> +       stat->nr_immediate += curr->nr_immediate;
>>>> +       stat->nr_pageout += curr->nr_pageout;
>>>> +       stat->nr_ref_keep += curr->nr_ref_keep;
>>>> +       stat->nr_unmap_fail += curr->nr_unmap_fail;
>>>> +       stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
>>>> +       stat->nr_demoted += curr->nr_demoted;
>>>> +       for (i = 0; i < ANON_AND_FILE; i++)
>>>> +               stat->nr_activate[i] = curr->nr_activate[i];
>>>> +}
>>>
>>> you didn't have this before, what's the purpose of this?
>>>
>>
>> We may call shrink_folio_list twice, and the 'stat curr' will reset in
>> the shrink_folio_list function. We should accumulate the stats as a
>> whole, which will then be used to calculate the cost and return it to
>> the caller.
>
> Does mglru have the same issue? If so, we may need to send a patch to
> fix mglru's stat accounting as well. By the way, the code is rather
> messy; could it be implemented as shown below instead?
>

I have checked the code (in the evict_folios function) again, and it appears
that 'reclaimed' should correspond to sc->nr_reclaimed, which accumulates the
results twice. Should I address this issue with a separate patch?
The counting code in evict_folios that I am referring to:

        if (!cgroup_reclaim(sc))
                __count_vm_events(item, reclaimed);
        __count_memcg_events(memcg, item, reclaimed);
        __count_vm_events(PGSTEAL_ANON + type, reclaimed);

> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 1f0d194f8b2f..40d2ddde21f5 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>         struct swap_iocb *plug = NULL;
>
>         folio_batch_init(&free_folios);
> -       memset(stat, 0, sizeof(*stat));
>         cond_resched();
>         do_demote_pass = can_demote(pgdat->node_id, sc);
>
> @@ -1949,25 +1948,6 @@ static int current_may_throttle(void)
>         return !(current->flags & PF_LOCAL_THROTTLE);
>  }
>
> -static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
> -                                     struct reclaim_stat *curr)
> -{
> -       int i;
> -
> -       stat->nr_dirty += curr->nr_dirty;
> -       stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
> -       stat->nr_congested += curr->nr_congested;
> -       stat->nr_writeback += curr->nr_writeback;
> -       stat->nr_immediate += curr->nr_immediate;
> -       stat->nr_pageout += curr->nr_pageout;
> -       stat->nr_ref_keep += curr->nr_ref_keep;
> -       stat->nr_unmap_fail += curr->nr_unmap_fail;
> -       stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
> -       stat->nr_demoted += curr->nr_demoted;
> -       for (i = 0; i < ANON_AND_FILE; i++)
> -               stat->nr_activate[i] = curr->nr_activate[i];
> -}
> -
>  /*
>   * shrink_inactive_list() is a helper for shrink_node(). It returns the number
>   * of reclaimed pages
> @@ -1981,7 +1961,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>         unsigned long nr_scanned;
>         unsigned int nr_reclaimed = 0;
>         unsigned long nr_taken;
> -       struct reclaim_stat stat, curr;
> +       struct reclaim_stat stat;
>         bool file = is_file_lru(lru);
>         enum vm_event_item item;
>         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> @@ -2022,9 +2002,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>
>         memset(&stat, 0, sizeof(stat));
>  retry:
> -       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
> +       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
>         find_folios_written_back(&folio_list, &clean_list, skip_retry);
> -       acc_reclaimed_stat(&stat, &curr);
>
>         spin_lock_irq(&lruvec->lru_lock);
>         move_folios_to_lru(lruvec, &folio_list);
>

This seems much better. But we have some extra work to do:

1. In the shrink_folio_list function:
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1089,12 +1089,12 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
        LIST_HEAD(ret_folios);
        LIST_HEAD(demote_folios);
        unsigned int nr_reclaimed = 0;
-       unsigned int pgactivate = 0;
+       unsigned int pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
+       unsigned int nr_demote = 0;
        bool do_demote_pass;
        struct swap_iocb *plug = NULL;

        folio_batch_init(&free_folios);
-       memset(stat, 0, sizeof(*stat));
        cond_resched();
        do_demote_pass = can_demote(pgdat->node_id, sc);

@@ -1558,7 +1558,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
        /* Migrate folios selected for demotion */
        stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
-       nr_reclaimed += stat->nr_demoted;
+       stat->nr_demoted += nr_demote;
+       nr_reclaimed += nr_demote;
        /* Folios that could not be demoted are still in @demote_folios */
        if (!list_empty(&demote_folios)) {
                /* Folios which weren't demoted go back on @folio_list */
@@ -1586,7 +1587,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
                }
        }

-       pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
+       pgactivate = stat->nr_activate[0] + stat->nr_activate[1] - pgactivate;

        mem_cgroup_uncharge_folios(&free_folios);
        try_to_unmap_flush();

2. Outside of the shrink_folio_list function, the callers should memset the stat.

If you think this will be better, I will update like this.

>>
>> Thanks,
>> Ridong
>>
>>>> +
>>>>  /*
>>>>   * shrink_inactive_list() is a helper for shrink_node(). It returns the number
>>>>   * of reclaimed pages
>>>> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>>>>                                      enum lru_list lru)
>>>>  {
>>>>         LIST_HEAD(folio_list);
>>>> +       LIST_HEAD(clean_list);
>>>>         unsigned long nr_scanned;
>>>>         unsigned int nr_reclaimed = 0;
>>>>         unsigned long nr_taken;
>>>> -       struct reclaim_stat stat;
>>>> +       struct reclaim_stat stat, curr;
>>>>         bool file = is_file_lru(lru);
>>>>         enum vm_event_item item;
>>>>         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>>>         bool stalled = false;
>>>> +       bool skip_retry = false;
>>>>
>>>>         while (unlikely(too_many_isolated(pgdat, file, sc))) {
>>>>                 if (stalled)
>>>> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>>>>         if (nr_taken == 0)
>>>>                 return 0;
>>>>
>>>> -       nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
>>>> +       memset(&stat, 0, sizeof(stat));
>>>> +retry:
>>>> +       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
>>>> +       find_folios_written_back(&folio_list, &clean_list, skip_retry);
>>>> +       acc_reclaimed_stat(&stat, &curr);
>>>>
>>>>         spin_lock_irq(&lruvec->lru_lock);
>>>>         move_folios_to_lru(lruvec, &folio_list);
>>>> +       if (!list_empty(&clean_list)) {
>>>> +               list_splice_init(&clean_list, &folio_list);
>>>> +               skip_retry = true;
>>>> +               spin_unlock_irq(&lruvec->lru_lock);
>>>> +               goto retry;
>
> This is rather confusing. We're still jumping to retry even though
> skip_retry=true is set. Can we find a clearer approach for this?
>
> It was somewhat acceptable before we introduced the extracted
> function find_folios_written_back(). However, it has become
> harder to follow now that skip_retry is passed across functions.
>
> I find renaming skip_retry to is_retry more intuitive. The logic
> is that since we are already retrying, find_folios_written_back()
> shouldn't move folios to the clean list again. The intended semantics
> are: we have retried, don't retry again.
>

Reasonable. Will update.
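Something like the following is what I have in mind for the retry loop in
shrink_inactive_list after the rename (just an untested sketch on top of the
hunks above, so the exact form may still change):

        bool is_retry = false;

        memset(&stat, 0, sizeof(stat));
retry:
        nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
        /* when already retrying, don't pull folios onto clean_list again */
        find_folios_written_back(&folio_list, &clean_list, is_retry);

        spin_lock_irq(&lruvec->lru_lock);
        move_folios_to_lru(lruvec, &folio_list);
        if (!is_retry && !list_empty(&clean_list)) {
                /* one retry for folios that may have missed folio_rotate_reclaimable() */
                list_splice_init(&clean_list, &folio_list);
                is_retry = true;
                spin_unlock_irq(&lruvec->lru_lock);
                goto retry;
        }

With is_retry checked before splicing, the loop clearly terminates after a
single retry, and find_folios_written_back() will not move anything to
clean_list on the second pass.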
Thanks,
Ridong

>
>>>> +       }
>>>>
>>>>         __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
>>>>                            stat.nr_demoted);
>>>> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>>>>         int reclaimed;
>>>>         LIST_HEAD(list);
>>>>         LIST_HEAD(clean);
>>>> -       struct folio *folio;
>>>> -       struct folio *next;
>>>>         enum vm_event_item item;
>>>>         struct reclaim_stat stat;
>>>>         struct lru_gen_mm_walk *walk;
>>>> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>>>>                         scanned, reclaimed, &stat, sc->priority,
>>>>                         type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
>>>>
>>>> -       list_for_each_entry_safe_reverse(folio, next, &list, lru) {
>>>> -               if (!folio_evictable(folio)) {
>>>> -                       list_del(&folio->lru);
>>>> -                       folio_putback_lru(folio);
>>>> -                       continue;
>>>> -               }
>>>> -
>>>> -               if (folio_test_reclaim(folio) &&
>>>> -                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
>>>> -                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
>>>> -                       if (folio_test_workingset(folio))
>>>> -                               folio_set_referenced(folio);
>>>> -                       continue;
>>>> -               }
>>>> -
>>>> -               if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
>>>> -                   folio_mapped(folio) || folio_test_locked(folio) ||
>>>> -                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
>>>> -                       /* don't add rejected folios to the oldest generation */
>>>> -                       set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
>>>> -                                     BIT(PG_active));
>>>> -                       continue;
>>>> -               }
>>>> -
>>>> -               /* retry folios that may have missed folio_rotate_reclaimable() */
>>>> -               list_move(&folio->lru, &clean);
>>>> -       }
>>>> -
>>>> +       find_folios_written_back(&list, &clean, skip_retry);
>>>>         spin_lock_irq(&lruvec->lru_lock);
>>>>
>>>>         move_folios_to_lru(lruvec, &list);
>>>> --
>>>> 2.34.1
>>>>
>>>
>
> Thanks
> Barry