On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@xxxxxxxxxx> wrote:
>
>
>
> On 2024/12/10 12:54, Barry Song wrote:
> > On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@xxxxxxxxxxxxxxx> wrote:
> >>
> >> From: Chen Ridong <chenridong@xxxxxxxxxx>
> >>
> >> Commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
> >> while isolated") only fixed the issue for mglru. However, this issue
> >> also exists in the traditional active/inactive LRU. The issue is worse
> >> when a THP is split, which makes the list longer and requires more time
> >> to finish a batch of folio reclaim.
> >>
> >> This issue should be fixed in the same way for the traditional LRU.
> >> Therefore, first extract the common logic into a new
> >> 'find_folios_written_back' function, which is then reused in the
> >> 'shrink_inactive_list' function. Finally, retry reclaiming those folios
> >> that may have missed the rotation for the traditional LRU as well.
> >
> > let's drop the cover-letter and refine the changelog.
> >
> Will update.
>
> >>
> >> Signed-off-by: Chen Ridong <chenridong@xxxxxxxxxx>
> >> ---
> >>  include/linux/mmzone.h |   3 +-
> >>  mm/vmscan.c            | 108 +++++++++++++++++++++++++++++------------
> >>  2 files changed, 77 insertions(+), 34 deletions(-)
> >>
> >> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >> index b36124145a16..47c6e8c43dcd 100644
> >> --- a/include/linux/mmzone.h
> >> +++ b/include/linux/mmzone.h
> >> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk;
> >>
> >>  #define LRU_GEN_MASK   ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
> >>  #define LRU_REFS_MASK  ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
> >> +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
> >>
> >>  #ifdef CONFIG_LRU_GEN
> >>
> >> @@ -406,8 +407,6 @@ enum {
> >>         NR_LRU_GEN_CAPS
> >>  };
> >>
> >> -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
> >> -
> >>  #define MIN_LRU_BATCH  BITS_PER_LONG
> >>  #define MAX_LRU_BATCH  (MIN_LRU_BATCH * 64)
> >>
> >> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> index 76378bc257e3..1f0d194f8b2f 100644
> >> --- a/mm/vmscan.c
> >> +++ b/mm/vmscan.c
> >> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task,
> >>         task->reclaim_state = rs;
> >>  }
> >>
> >> +/**
> >> + * find_folios_written_back - Find and move the written back folios to a new list.
> >> + * @list: folios list
> >> + * @clean: the written back folios list
> >> + * @skip: whether to skip moving the written back folios to the clean list.
> >> + */
> >> +static inline void find_folios_written_back(struct list_head *list,
> >> +                                           struct list_head *clean, bool skip)
> >> +{
> >> +       struct folio *folio;
> >> +       struct folio *next;
> >> +
> >> +       list_for_each_entry_safe_reverse(folio, next, list, lru) {
> >> +               if (!folio_evictable(folio)) {
> >> +                       list_del(&folio->lru);
> >> +                       folio_putback_lru(folio);
> >> +                       continue;
> >> +               }
> >> +
> >> +               if (folio_test_reclaim(folio) &&
> >> +                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
> >> +                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> >> +                       if (lru_gen_enabled() && folio_test_workingset(folio))
> >> +                               folio_set_referenced(folio);
> >> +                       continue;
> >> +               }
> >> +
> >> +               if (skip || folio_test_active(folio) || folio_test_referenced(folio) ||
> >> +                   folio_mapped(folio) || folio_test_locked(folio) ||
> >> +                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
> >> +                       /* don't add rejected folios to the oldest generation */
> >> +                       if (lru_gen_enabled())
> >> +                               set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
> >> +                                             BIT(PG_active));
> >> +                       continue;
> >> +               }
> >> +
> >> +               /* retry folios that may have missed folio_rotate_reclaimable() */
> >> +               list_move(&folio->lru, clean);
> >> +       }
> >> +}
> >> +
> >>  /*
> >>   * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> >>   * scan_control->nr_reclaimed.
> >> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void)
> >>         return !(current->flags & PF_LOCAL_THROTTLE);
> >>  }
> >>
> >> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
> >> +                                     struct reclaim_stat *curr)
> >> +{
> >> +       int i;
> >> +
> >> +       stat->nr_dirty += curr->nr_dirty;
> >> +       stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
> >> +       stat->nr_congested += curr->nr_congested;
> >> +       stat->nr_writeback += curr->nr_writeback;
> >> +       stat->nr_immediate += curr->nr_immediate;
> >> +       stat->nr_pageout += curr->nr_pageout;
> >> +       stat->nr_ref_keep += curr->nr_ref_keep;
> >> +       stat->nr_unmap_fail += curr->nr_unmap_fail;
> >> +       stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
> >> +       stat->nr_demoted += curr->nr_demoted;
> >> +       for (i = 0; i < ANON_AND_FILE; i++)
> >> +               stat->nr_activate[i] = curr->nr_activate[i];
> >> +}
> >
> > you didn't have this before; what's the purpose of it?
> >
>
> We may call shrink_folio_list() twice, and the stat 'curr' is reset
> inside shrink_folio_list(). We should accumulate the stats as a whole,
> which will then be used to calculate the cost and returned to the
> caller.

Does mglru have the same issue? If so, we may need to send a patch to
fix mglru's stat accounting as well.

By the way, the code is rather messy; could it be implemented as shown
below instead?
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1f0d194f8b2f..40d2ddde21f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
        struct swap_iocb *plug = NULL;

        folio_batch_init(&free_folios);
-       memset(stat, 0, sizeof(*stat));
        cond_resched();
        do_demote_pass = can_demote(pgdat->node_id, sc);

@@ -1949,25 +1948,6 @@ static int current_may_throttle(void)
        return !(current->flags & PF_LOCAL_THROTTLE);
 }

-static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
-                                     struct reclaim_stat *curr)
-{
-       int i;
-
-       stat->nr_dirty += curr->nr_dirty;
-       stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
-       stat->nr_congested += curr->nr_congested;
-       stat->nr_writeback += curr->nr_writeback;
-       stat->nr_immediate += curr->nr_immediate;
-       stat->nr_pageout += curr->nr_pageout;
-       stat->nr_ref_keep += curr->nr_ref_keep;
-       stat->nr_unmap_fail += curr->nr_unmap_fail;
-       stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
-       stat->nr_demoted += curr->nr_demoted;
-       for (i = 0; i < ANON_AND_FILE; i++)
-               stat->nr_activate[i] = curr->nr_activate[i];
-}
-
 /*
  * shrink_inactive_list() is a helper for shrink_node(). It returns the number
  * of reclaimed pages
@@ -1981,7 +1961,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
        unsigned long nr_scanned;
        unsigned int nr_reclaimed = 0;
        unsigned long nr_taken;
-       struct reclaim_stat stat, curr;
+       struct reclaim_stat stat;
        bool file = is_file_lru(lru);
        enum vm_event_item item;
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2022,9 +2002,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,

        memset(&stat, 0, sizeof(stat));
 retry:
-       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
+       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
        find_folios_written_back(&folio_list, &clean_list, skip_retry);
-       acc_reclaimed_stat(&stat, &curr);

        spin_lock_irq(&lruvec->lru_lock);
        move_folios_to_lru(lruvec, &folio_list);

>
> Thanks,
> Ridong
>
> >> +
> >>  /*
> >>   * shrink_inactive_list() is a helper for shrink_node(). It returns the number
> >>   * of reclaimed pages
> >> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >>                                           enum lru_list lru)
> >>  {
> >>         LIST_HEAD(folio_list);
> >> +       LIST_HEAD(clean_list);
> >>         unsigned long nr_scanned;
> >>         unsigned int nr_reclaimed = 0;
> >>         unsigned long nr_taken;
> >> -       struct reclaim_stat stat;
> >> +       struct reclaim_stat stat, curr;
> >>         bool file = is_file_lru(lru);
> >>         enum vm_event_item item;
> >>         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> >>         bool stalled = false;
> >> +       bool skip_retry = false;
> >>
> >>         while (unlikely(too_many_isolated(pgdat, file, sc))) {
> >>                 if (stalled)
> >> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >>         if (nr_taken == 0)
> >>                 return 0;
> >>
> >> -       nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
> >> +       memset(&stat, 0, sizeof(stat));
> >> +retry:
> >> +       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
> >> +       find_folios_written_back(&folio_list, &clean_list, skip_retry);
> >> +       acc_reclaimed_stat(&stat, &curr);
> >>
> >>         spin_lock_irq(&lruvec->lru_lock);
> >>         move_folios_to_lru(lruvec, &folio_list);
> >> +       if (!list_empty(&clean_list)) {
> >> +               list_splice_init(&clean_list, &folio_list);
> >> +               skip_retry = true;
> >> +               spin_unlock_irq(&lruvec->lru_lock);
> >> +               goto retry;

This is rather confusing.
We're still jumping to retry even though skip_retry=true is set. Can we
find a clearer approach for this? It was somewhat acceptable before we
introduced the extracted function find_folios_written_back(). However,
it has become harder to follow now that skip_retry is passed across
functions.

I find renaming skip_retry to is_retry more intuitive. The logic is
that since we are already retrying, find_folios_written_back() shouldn't
move folios to the clean list again. The intended semantics are: we have
retried, don't retry again. A rough sketch of what I mean is appended at
the end of this mail.

> >> +       }
> >>
> >>         __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
> >>                            stat.nr_demoted);
> >> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
> >>         int reclaimed;
> >>         LIST_HEAD(list);
> >>         LIST_HEAD(clean);
> >> -       struct folio *folio;
> >> -       struct folio *next;
> >>         enum vm_event_item item;
> >>         struct reclaim_stat stat;
> >>         struct lru_gen_mm_walk *walk;
> >> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
> >>                            scanned, reclaimed, &stat, sc->priority,
> >>                            type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
> >>
> >> -       list_for_each_entry_safe_reverse(folio, next, &list, lru) {
> >> -               if (!folio_evictable(folio)) {
> >> -                       list_del(&folio->lru);
> >> -                       folio_putback_lru(folio);
> >> -                       continue;
> >> -               }
> >> -
> >> -               if (folio_test_reclaim(folio) &&
> >> -                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
> >> -                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> >> -                       if (folio_test_workingset(folio))
> >> -                               folio_set_referenced(folio);
> >> -                       continue;
> >> -               }
> >> -
> >> -               if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
> >> -                   folio_mapped(folio) || folio_test_locked(folio) ||
> >> -                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
> >> -                       /* don't add rejected folios to the oldest generation */
> >> -                       set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
> >> -                                     BIT(PG_active));
> >> -                       continue;
> >> -               }
> >> -
> >> -               /* retry folios that may have missed folio_rotate_reclaimable() */
> >> -               list_move(&folio->lru, &clean);
> >> -       }
> >> -
> >> +       find_folios_written_back(&list, &clean, skip_retry);
> >>         spin_lock_irq(&lruvec->lru_lock);
> >>
> >>         move_folios_to_lru(lruvec, &list);
> >> --
> >> 2.34.1
> >>
>

Thanks
Barry
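---

Just to make the rename concrete, below is a rough, untested sketch of
the shrink_inactive_list() retry loop. It is simply the combination of
the two diffs above plus the skip_retry -> is_retry rename, with no new
logic; the comments spell out the intended semantics:

        bool is_retry = false;

        ...

        memset(&stat, 0, sizeof(stat));
retry:
        nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
        /* on the retry pass is_retry is true, so nothing is collected again */
        find_folios_written_back(&folio_list, &clean_list, is_retry);

        spin_lock_irq(&lruvec->lru_lock);
        move_folios_to_lru(lruvec, &folio_list);
        if (!list_empty(&clean_list)) {
                /* we are about to retry: mark it so we retry at most once */
                list_splice_init(&clean_list, &folio_list);
                is_retry = true;
                spin_unlock_irq(&lruvec->lru_lock);
                goto retry;
        }

With that name, the flag describes the state we are in ("this is already
the retry pass") rather than an instruction to the callee, which I think
is what makes the goto easier to follow.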