On 2024/12/13 7:17, Barry Song wrote:
> On Wed, Dec 11, 2024 at 1:11 AM chenridong <chenridong@xxxxxxxxxx> wrote:
>>
>> On 2024/12/10 16:24, Barry Song wrote:
>>> On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@xxxxxxxxxx> wrote:
>>>>
>>>> On 2024/12/10 12:54, Barry Song wrote:
>>>>> On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@xxxxxxxxxxxxxxx> wrote:
>>>>>>
>>>>>> From: Chen Ridong <chenridong@xxxxxxxxxx>
>>>>>>
>>>>>> The commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
>>>>>> while isolated") only fixed the issue for mglru. However, this issue
>>>>>> also exists in the traditional active/inactive LRU. This issue will be
>>>>>> worse if THP is split, which makes the list longer and needs longer time
>>>>>> to finish a batch of folios reclaim.
>>>>>>
>>>>>> This issue should be fixed in the same way for the traditional LRU.
>>>>>> Therefore, the common logic was extracted to the 'find_folios_written_back'
>>>>>> function firstly, which is then reused in the 'shrink_inactive_list'
>>>>>> function. Finally, retry reclaiming those folios that may have missed the
>>>>>> rotation for traditional LRU.
>>>>>
>>>>> let's drop the cover-letter and refine the changelog.
>>>>>
>>>> Will update.
>>>>
>>>>>>
>>>>>> Signed-off-by: Chen Ridong <chenridong@xxxxxxxxxx>
>>>>>> ---
>>>>>>  include/linux/mmzone.h |   3 +-
>>>>>>  mm/vmscan.c            | 108 +++++++++++++++++++++++++++++------------
>>>>>>  2 files changed, 77 insertions(+), 34 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>>> index b36124145a16..47c6e8c43dcd 100644
>>>>>> --- a/include/linux/mmzone.h
>>>>>> +++ b/include/linux/mmzone.h
>>>>>> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk;
>>>>>>
>>>>>>  #define LRU_GEN_MASK   ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
>>>>>>  #define LRU_REFS_MASK  ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
>>>>>> +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
>>>>>>
>>>>>>  #ifdef CONFIG_LRU_GEN
>>>>>>
>>>>>> @@ -406,8 +407,6 @@ enum {
>>>>>>         NR_LRU_GEN_CAPS
>>>>>>  };
>>>>>>
>>>>>> -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
>>>>>> -
>>>>>>  #define MIN_LRU_BATCH  BITS_PER_LONG
>>>>>>  #define MAX_LRU_BATCH  (MIN_LRU_BATCH * 64)
>>>>>>
>>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>>> index 76378bc257e3..1f0d194f8b2f 100644
>>>>>> --- a/mm/vmscan.c
>>>>>> +++ b/mm/vmscan.c
>>>>>> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>>>         task->reclaim_state = rs;
>>>>>>  }
>>>>>>
>>>>>> +/**
>>>>>> + * find_folios_written_back - Find and move the written back folios to a new list.
>>>>>> + * @list: filios list
>>>>>> + * @clean: the written back folios list
>>>>>> + * @skip: whether skip to move the written back folios to clean list.
>>>>>> + */
>>>>>> +static inline void find_folios_written_back(struct list_head *list,
>>>>>> +               struct list_head *clean, bool skip)
>>>>>> +{
>>>>>> +       struct folio *folio;
>>>>>> +       struct folio *next;
>>>>>> +
>>>>>> +       list_for_each_entry_safe_reverse(folio, next, list, lru) {
>>>>>> +               if (!folio_evictable(folio)) {
>>>>>> +                       list_del(&folio->lru);
>>>>>> +                       folio_putback_lru(folio);
>>>>>> +                       continue;
>>>>>> +               }
>>>>>> +
>>>>>> +               if (folio_test_reclaim(folio) &&
>>>>>> +                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
>>>>>> +                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
>>>>>> +                       if (lru_gen_enabled() && folio_test_workingset(folio))
>>>>>> +                               folio_set_referenced(folio);
>>>>>> +                       continue;
>>>>>> +               }
>>>>>> +
>>>>>> +               if (skip || folio_test_active(folio) || folio_test_referenced(folio) ||
>>>>>> +                   folio_mapped(folio) || folio_test_locked(folio) ||
>>>>>> +                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
>>>>>> +                       /* don't add rejected folios to the oldest generation */
>>>>>> +                       if (lru_gen_enabled())
>>>>>> +                               set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
>>>>>> +                                             BIT(PG_active));
>>>>>> +                       continue;
>>>>>> +               }
>>>>>> +
>>>>>> +               /* retry folios that may have missed folio_rotate_reclaimable() */
>>>>>> +               list_move(&folio->lru, clean);
>>>>>> +       }
>>>>>> +}
>>>>>> +
>>>>>>  /*
>>>>>>   * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
>>>>>>   * scan_control->nr_reclaimed.
>>>>>> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void)
>>>>>>         return !(current->flags & PF_LOCAL_THROTTLE);
>>>>>>  }
>>>>>>
>>>>>> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
>>>>>> +                               struct reclaim_stat *curr)
>>>>>> +{
>>>>>> +       int i;
>>>>>> +
>>>>>> +       stat->nr_dirty += curr->nr_dirty;
>>>>>> +       stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
>>>>>> +       stat->nr_congested += curr->nr_congested;
>>>>>> +       stat->nr_writeback += curr->nr_writeback;
>>>>>> +       stat->nr_immediate += curr->nr_immediate;
>>>>>> +       stat->nr_pageout += curr->nr_pageout;
>>>>>> +       stat->nr_ref_keep += curr->nr_ref_keep;
>>>>>> +       stat->nr_unmap_fail += curr->nr_unmap_fail;
>>>>>> +       stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
>>>>>> +       stat->nr_demoted += curr->nr_demoted;
>>>>>> +       for (i = 0; i < ANON_AND_FILE; i++)
>>>>>> +               stat->nr_activate[i] = curr->nr_activate[i];
>>>>>> +}
>>>>>
>>>>> You didn't have this before; what is the purpose of it?
>>>>>
>>>>
>>>> We may call shrink_folio_list twice, and the 'curr' stat will be reset
>>>> inside shrink_folio_list. We should accumulate the stats as a whole,
>>>> which will then be used to calculate the cost and returned to the
>>>> caller.
>>>
>>> Does mglru have the same issue? If so, we may need to send a patch to
>>> fix mglru's stat accounting as well. By the way, the code is rather
>>> messy; could it be implemented as shown below instead?
>>>
>>
>> I have checked the code (in the evict_folios function) again, and it
>> appears that 'reclaimed' should correspond to sc->nr_reclaimed, which
>> accumulates the results twice. Should I address this issue with a
>> separate patch?
>
> I don't think there is any problem.
>
>         reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
>         sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
>         sc->nr_reclaimed += reclaimed;
>
> reclaimed is always the number of pages we have reclaimed in
> shrink_folio_list(), no matter whether it is a retry or not.
>

Thank you, I made a mistake.
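
To restate the accounting as I now understand it, here is a simplified
sketch, loosely modeled on the quoted evict_folios() code (not the real
kernel source; locking and move_folios_to_lru() are omitted):

        /*
         * Each call to shrink_folio_list() returns only the pages freed in
         * that one pass, so adding the return value to sc->nr_reclaimed on
         * both the first pass and the retry pass counts every page once.
         */
        reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);  /* pass 1 */
        sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
        sc->nr_reclaimed += reclaimed;

        if (!list_empty(&clean)) {
                /* folios that may have missed folio_rotate_reclaimable() */
                list_splice_init(&clean, &list);
                reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);  /* pass 2 */
                sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
                sc->nr_reclaimed += reclaimed;  /* only the pages freed in pass 2 */
        }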
>>
>>         if (!cgroup_reclaim(sc))
>>                 __count_vm_events(item, reclaimed);
>>         __count_memcg_events(memcg, item, reclaimed);
>>         __count_vm_events(PGSTEAL_ANON + type, reclaimed);
>>
>>
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index 1f0d194f8b2f..40d2ddde21f5 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>>>         struct swap_iocb *plug = NULL;
>>>
>>>         folio_batch_init(&free_folios);
>>> -       memset(stat, 0, sizeof(*stat));
>>>         cond_resched();
>>>         do_demote_pass = can_demote(pgdat->node_id, sc);
>>>
>>> @@ -1949,25 +1948,6 @@ static int current_may_throttle(void)
>>>         return !(current->flags & PF_LOCAL_THROTTLE);
>>>  }
>>>
>>> -static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
>>> -                               struct reclaim_stat *curr)
>>> -{
>>> -       int i;
>>> -
>>> -       stat->nr_dirty += curr->nr_dirty;
>>> -       stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
>>> -       stat->nr_congested += curr->nr_congested;
>>> -       stat->nr_writeback += curr->nr_writeback;
>>> -       stat->nr_immediate += curr->nr_immediate;
>>> -       stat->nr_pageout += curr->nr_pageout;
>>> -       stat->nr_ref_keep += curr->nr_ref_keep;
>>> -       stat->nr_unmap_fail += curr->nr_unmap_fail;
>>> -       stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
>>> -       stat->nr_demoted += curr->nr_demoted;
>>> -       for (i = 0; i < ANON_AND_FILE; i++)
>>> -               stat->nr_activate[i] = curr->nr_activate[i];
>>> -}
>>> -
>>>  /*
>>>   * shrink_inactive_list() is a helper for shrink_node(). It returns the number
>>>   * of reclaimed pages
>>> @@ -1981,7 +1961,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>>>         unsigned long nr_scanned;
>>>         unsigned int nr_reclaimed = 0;
>>>         unsigned long nr_taken;
>>> -       struct reclaim_stat stat, curr;
>>> +       struct reclaim_stat stat;
>>>         bool file = is_file_lru(lru);
>>>         enum vm_event_item item;
>>>         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>> @@ -2022,9 +2002,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>>>
>>>         memset(&stat, 0, sizeof(stat));
>>>  retry:
>>> -       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
>>> +       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
>>>         find_folios_written_back(&folio_list, &clean_list, skip_retry);
>>> -       acc_reclaimed_stat(&stat, &curr);
>>>
>>>         spin_lock_irq(&lruvec->lru_lock);
>>>         move_folios_to_lru(lruvec, &folio_list);
>>>
>>
>> This seems much better. But we have extra work to do:
>>
>> 1. In the shrink_folio_list function:
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -1089,12 +1089,12 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>>         LIST_HEAD(ret_folios);
>>         LIST_HEAD(demote_folios);
>>         unsigned int nr_reclaimed = 0;
>> -       unsigned int pgactivate = 0;
>> +       unsigned int pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
>> +       unsigned int nr_demote = 0;
>>         bool do_demote_pass;
>>         struct swap_iocb *plug = NULL;
>>
>>         folio_batch_init(&free_folios);
>> -       memset(stat, 0, sizeof(*stat));
>>         cond_resched();
>>         do_demote_pass = can_demote(pgdat->node_id, sc);
>>
>> @@ -1558,7 +1558,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>>
>>         /* Migrate folios selected for demotion */
>>         stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
>> -       nr_reclaimed += stat->nr_demoted;
>> +       stat->nr_demoted += nr_demote;
>> +       nr_reclaimed += nr_demote;
>>         /* Folios that could not be demoted are still in @demote_folios */
>>         if (!list_empty(&demote_folios)) {
>>                 /* Folios which weren't demoted go back on @folio_list */
>> @@ -1586,7 +1587,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>>                 }
>>         }
>>
>> -       pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
>> +       pgactivate = stat->nr_activate[0] + stat->nr_activate[1] - pgactivate;
>>
>>         mem_cgroup_uncharge_folios(&free_folios);
>>         try_to_unmap_flush();
>>
>>
>> 2. Outside of the shrink_folio_list function, the callers should memset
>> the stat.
>>
>> If you think this will be better, I will update like this.
>
> No. Please goto retry after you have collected all you need from
> `stat`, just as mglru is doing, and drop the "curr" and acc_reclaimed_stat().
>
>         sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
>         sc->nr_reclaimed += reclaimed;
>
> move_folios_to_lru() has already moved all the uninteresting folios back to
> the lruvec before you retry, so there is no duplicated counting.
>

I see, thank you.
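
To confirm the idea, the shrink_inactive_list() side would then look roughly
like the sketch below. This is only a sketch, not the final patch: it assumes
shrink_folio_list() keeps zeroing *stat at entry as it does today, and the
counters still needed after the loop (stat.nr_demoted, stat.nr_activate[],
etc.) would need the same per-pass treatment.

        memset(&stat, 0, sizeof(stat));
retry:
        nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
        find_folios_written_back(&folio_list, &clean_list, skip_retry);

        /* collect the per-pass numbers before 'stat' is reused by the retry */
        sc->nr.dirty += stat.nr_dirty;
        sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
        sc->nr.congested += stat.nr_congested;
        sc->nr.writeback += stat.nr_writeback;
        sc->nr.immediate += stat.nr_immediate;

        spin_lock_irq(&lruvec->lru_lock);
        move_folios_to_lru(lruvec, &folio_list);
        if (!list_empty(&clean_list)) {
                list_splice_init(&clean_list, &folio_list);
                skip_retry = true;
                spin_unlock_irq(&lruvec->lru_lock);
                goto retry;
        }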
Thanks,
Ridong

>>
>>>>
>>>> Thanks,
>>>> Ridong
>>>>
>>>>>> +
>>>>>>  /*
>>>>>>   * shrink_inactive_list() is a helper for shrink_node(). It returns the number
>>>>>>   * of reclaimed pages
>>>>>> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>>>>>>                                 enum lru_list lru)
>>>>>>  {
>>>>>>         LIST_HEAD(folio_list);
>>>>>> +       LIST_HEAD(clean_list);
>>>>>>         unsigned long nr_scanned;
>>>>>>         unsigned int nr_reclaimed = 0;
>>>>>>         unsigned long nr_taken;
>>>>>> -       struct reclaim_stat stat;
>>>>>> +       struct reclaim_stat stat, curr;
>>>>>>         bool file = is_file_lru(lru);
>>>>>>         enum vm_event_item item;
>>>>>>         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>>>>>         bool stalled = false;
>>>>>> +       bool skip_retry = false;
>>>>>>
>>>>>>         while (unlikely(too_many_isolated(pgdat, file, sc))) {
>>>>>>                 if (stalled)
>>>>>> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>>>>>>         if (nr_taken == 0)
>>>>>>                 return 0;
>>>>>>
>>>>>> -       nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
>>>>>> +       memset(&stat, 0, sizeof(stat));
>>>>>> +retry:
>>>>>> +       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
>>>>>> +       find_folios_written_back(&folio_list, &clean_list, skip_retry);
>>>>>> +       acc_reclaimed_stat(&stat, &curr);
>>>>>>
>>>>>>         spin_lock_irq(&lruvec->lru_lock);
>>>>>>         move_folios_to_lru(lruvec, &folio_list);
>>>>>> +       if (!list_empty(&clean_list)) {
>>>>>> +               list_splice_init(&clean_list, &folio_list);
>>>>>> +               skip_retry = true;
>>>>>> +               spin_unlock_irq(&lruvec->lru_lock);
>>>>>> +               goto retry;
>>>
>>> This is rather confusing. We're still jumping to retry even though
>>> skip_retry=true is set. Can we find a clearer approach for this?
>>>
>>> It was somewhat acceptable before we introduced the extracted
>>> function find_folios_written_back(). However, it has become
>>> harder to follow now that skip_retry is passed across functions.
>>>
>>> I find renaming skip_retry to is_retry more intuitive. The logic
>>> is that since we are already retrying, find_folios_written_back()
>>> shouldn't move folios to the clean list again. The intended semantics
>>> are: we have already retried once, so don't retry again.
>>>
>>
>> Reasonable. Will update.
>>
>> Thanks,
>> Ridong
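
For the next version, with the rename applied (and is_retry initialised to
false next to the other locals), the retry loop would read roughly as below.
This is only an illustrative sketch of the intended semantics, not the final
patch:

retry:
        nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
        /* is_retry is false on the first pass, so candidates can be queued on
         * clean_list; on the retry pass nothing is queued again */
        find_folios_written_back(&folio_list, &clean_list, is_retry);

        spin_lock_irq(&lruvec->lru_lock);
        move_folios_to_lru(lruvec, &folio_list);
        if (!list_empty(&clean_list)) {
                list_splice_init(&clean_list, &folio_list);
                is_retry = true;
                spin_unlock_irq(&lruvec->lru_lock);
                goto retry;
        }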
>>
>>>
>>>>>> +       }
>>>>>>
>>>>>>         __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
>>>>>>                            stat.nr_demoted);
>>>>>> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>>>>>>         int reclaimed;
>>>>>>         LIST_HEAD(list);
>>>>>>         LIST_HEAD(clean);
>>>>>> -       struct folio *folio;
>>>>>> -       struct folio *next;
>>>>>>         enum vm_event_item item;
>>>>>>         struct reclaim_stat stat;
>>>>>>         struct lru_gen_mm_walk *walk;
>>>>>> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>>>>>>                         scanned, reclaimed, &stat, sc->priority,
>>>>>>                         type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
>>>>>>
>>>>>> -       list_for_each_entry_safe_reverse(folio, next, &list, lru) {
>>>>>> -               if (!folio_evictable(folio)) {
>>>>>> -                       list_del(&folio->lru);
>>>>>> -                       folio_putback_lru(folio);
>>>>>> -                       continue;
>>>>>> -               }
>>>>>> -
>>>>>> -               if (folio_test_reclaim(folio) &&
>>>>>> -                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
>>>>>> -                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
>>>>>> -                       if (folio_test_workingset(folio))
>>>>>> -                               folio_set_referenced(folio);
>>>>>> -                       continue;
>>>>>> -               }
>>>>>> -
>>>>>> -               if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
>>>>>> -                   folio_mapped(folio) || folio_test_locked(folio) ||
>>>>>> -                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
>>>>>> -                       /* don't add rejected folios to the oldest generation */
>>>>>> -                       set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
>>>>>> -                                     BIT(PG_active));
>>>>>> -                       continue;
>>>>>> -               }
>>>>>> -
>>>>>> -               /* retry folios that may have missed folio_rotate_reclaimable() */
>>>>>> -               list_move(&folio->lru, &clean);
>>>>>> -       }
>>>>>> -
>>>>>> +       find_folios_written_back(&list, &clean, skip_retry);
>>>>>>         spin_lock_irq(&lruvec->lru_lock);
>>>>>>
>>>>>>         move_folios_to_lru(lruvec, &list);
>>>>>> --
>>>>>> 2.34.1
>>>>>>
>>>>>
>>>
>>> Thanks
>>> Barry