On Thu, Jul 13, 2023 at 9:06 AM Yin Fengwei <fengwei.yin@xxxxxxxxx> wrote:
>
> Current madvise_cold_or_pageout_pte_range() has two problems for
> large folio support:
> - Using folio_mapcount() with a large folio prevents the large folio
>   from being picked up.
> - If a large folio is within the requested range, it shouldn't be
>   split in madvise_cold_or_pageout_pte_range().
>
> Fix them by:
> - Using folio_estimated_sharers() for large folios.
> - Not splitting a large folio that is within the requested range, and
>   leaving it to the page reclaim phase instead.
>
> For a large folio crossing the boundaries of the requested range, skip
> it if it's page cache. Try to split it if it's an anonymous folio. If
> splitting fails, skip it.
>
> The main reason to call folio_referenced() is to clear the young bits
> of the corresponding PTEs. So in the page reclaim phase, there is a
> good chance the folio can be reclaimed.
>
> Signed-off-by: Yin Fengwei <fengwei.yin@xxxxxxxxx>
> ---
> This patch is based on mlock large folio support rfc2 as it depends
> on the folio_in_range() added by that patchset.
>
> Also folio_op_size() can be unified with get_folio_mlock_step().
>
> Testing done:
> - kselftest: No new regression introduced.
>
>  mm/madvise.c | 133 ++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 84 insertions(+), 49 deletions(-)
>
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 38382a5d1e393..5748cf098235d 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -31,6 +31,7 @@
>  #include <linux/swapops.h>
>  #include <linux/shmem_fs.h>
>  #include <linux/mmu_notifier.h>
> +#include <linux/kernel.h>
>
>  #include <asm/tlb.h>
>
> @@ -339,6 +340,35 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
>  		file_permission(vma->vm_file, MAY_WRITE) == 0;
>  }
>
> +static inline bool skip_current_entry(struct folio *folio, bool pageout_anon)
> +{
> +	if (!folio)
> +		return true;
> +
> +	if (folio_is_zone_device(folio))
> +		return true;
> +
> +	if (!folio_test_lru(folio))
> +		return true;
> +
> +	if (pageout_anon && !folio_test_anon(folio))
> +		return true;
> +
> +	if (folio_test_unevictable(folio))
> +		return true;
> +
> +	return false;
> +}
> +
> +static inline unsigned int folio_op_size(struct folio *folio, pte_t pte,
> +		unsigned long addr, unsigned long end)
> +{
> +	unsigned int nr;
> +
> +	nr = folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte);
> +	return min_t(unsigned int, nr, (end - addr) >> PAGE_SHIFT);
> +}
> +
>  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  				unsigned long addr, unsigned long end,
>  				struct mm_walk *walk)
> @@ -353,6 +383,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  	struct folio *folio = NULL;
>  	LIST_HEAD(folio_list);
>  	bool pageout_anon_only_filter;
> +	unsigned long start = addr;
>
>  	if (fatal_signal_pending(current))
>  		return -EINTR;
> @@ -383,7 +414,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  		folio = pfn_folio(pmd_pfn(orig_pmd));
>
>  		/* Do not interfere with other mappings of this folio */
> -		if (folio_mapcount(folio) != 1)
> +		if (folio_estimated_sharers(folio) != 1)
>  			goto huge_unlock;
>
>  		if (pageout_anon_only_filter && !folio_test_anon(folio))
> @@ -442,78 +473,60 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  	for (; addr < end; pte++, addr += PAGE_SIZE) {
>  		ptent = ptep_get(pte);
>
> -		if (pte_none(ptent))
> -			continue;
> -
> -		if (!pte_present(ptent))
> +		if (pte_none(ptent) || !pte_present(ptent))
>  			continue;
>
>  		folio = vm_normal_folio(vma, addr, ptent);
> -		if (!folio || folio_is_zone_device(folio))
> +		if (skip_current_entry(folio, pageout_anon_only_filter))
>  			continue;
>
>  		/*
> -		 * Creating a THP page is expensive so split it only if we
> -		 * are sure it's worth. Split it if we are only owner.
> +		 * Split large folio if it's anonymous and cross the
> +		 * boundaries of request range.
>  		 */
>  		if (folio_test_large(folio)) {
> -			int err;
> +			int err, step;
> +
> +			if (folio_estimated_sharers(folio) != 1)
> +				continue;
> +
> +			if (folio_in_range(folio, vma, start, end))
> +				goto pageout_cold_folio;
>
> -			if (folio_mapcount(folio) != 1)
> -				break;
> -			if (pageout_anon_only_filter && !folio_test_anon(folio))
> -				break;
> -			if (!folio_trylock(folio))
> -				break;
>  			folio_get(folio);
> +			step = folio_op_size(folio, ptent, addr, end);
> +			if (!folio_test_anon(folio) || !folio_trylock(folio)) {
> +				folio_put(folio);
> +				goto next_folio;
> +			}
> +
>  			arch_leave_lazy_mmu_mode();
>  			pte_unmap_unlock(start_pte, ptl);
>  			start_pte = NULL;
>  			err = split_folio(folio);
>  			folio_unlock(folio);
>  			folio_put(folio);
> -			if (err)
> -				break;
> +
>  			start_pte = pte =
>  				pte_offset_map_lock(mm, pmd, addr, &ptl);
>  			if (!start_pte)
>  				break;
>  			arch_enter_lazy_mmu_mode();
> -			pte--;
> -			addr -= PAGE_SIZE;
> -			continue;
> -		}
>
> -		/*
> -		 * Do not interfere with other mappings of this folio and
> -		 * non-LRU folio.
> -		 */
> -		if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
> +			/* Skip the folio if split fails */
> +			if (!err)
> +				step = 0;
> +next_folio:
> +			pte += step - 1;
> +			addr += (step - 1) << PAGE_SHIFT;
>  			continue;
> +		}
>
> -		if (pageout_anon_only_filter && !folio_test_anon(folio))
> +		/* Do not interfere with other mappings of this folio */
> +		if (folio_mapcount(folio) != 1)
>  			continue;
>
> -		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
> -
> -		if (pte_young(ptent)) {
> -			ptent = ptep_get_and_clear_full(mm, addr, pte,
> -							tlb->fullmm);
> -			ptent = pte_mkold(ptent);
> -			set_pte_at(mm, addr, pte, ptent);
> -			tlb_remove_tlb_entry(tlb, pte, addr);
> -		}
> -
> -		/*
> -		 * We are deactivating a folio for accelerating reclaiming.
> -		 * VM couldn't reclaim the folio unless we clear PG_young.
> -		 * As a side effect, it makes confuse idle-page tracking
> -		 * because they will miss recent referenced history.
> -		 */
> -		folio_clear_referenced(folio);
> -		folio_test_clear_young(folio);
> -		if (folio_test_active(folio))
> -			folio_set_workingset(folio);
> +pageout_cold_folio:
>  		if (pageout) {
>  			if (folio_isolate_lru(folio)) {
>  				if (folio_test_unevictable(folio))
> @@ -529,8 +542,30 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  		arch_leave_lazy_mmu_mode();
>  		pte_unmap_unlock(start_pte, ptl);
>  	}
> -	if (pageout)
> -		reclaim_pages(&folio_list);
> +
> +	if (pageout) {
> +		LIST_HEAD(reclaim_list);
> +
> +		while (!list_empty(&folio_list)) {
> +			int refs;
> +			unsigned long flags;
> +			struct mem_cgroup *memcg = folio_memcg(folio);
> +
> +			folio = lru_to_folio(&folio_list);
> +			list_del(&folio->lru);
> +
> +			refs = folio_referenced(folio, 0, memcg, &flags);
> +
> +			if ((flags & VM_LOCKED) || (refs == -1)) {
> +				folio_putback_lru(folio);
> +				continue;
> +			}
> +
> +			folio_test_clear_referenced(folio);
> +			list_add(&folio->lru, &reclaim_list);
> +		}
> +		reclaim_pages(&reclaim_list);
> +	}

I overlooked the chunk above -- it's unnecessary: after we split the
large folio (and splice the base folios onto the same LRU list), we
continue at the position of the first base folio because of:

	pte--;
	addr -= PAGE_SIZE;
	continue;

And then we do pte_mkold(), which takes care of the A-bit.
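
To spell that out, the pre-patch flow (condensed from the '-' lines in
the hunks above, with locking and error handling elided) is roughly:

	if (folio_test_large(folio)) {
		...
		err = split_folio(folio);
		...
		/* rewind so the next iteration revisits the first base folio */
		pte--;
		addr -= PAGE_SIZE;
		continue;
	}
	...
	if (pte_young(ptent)) {
		ptent = ptep_get_and_clear_full(mm, addr, pte,
						tlb->fullmm);
		/* pte_mkold() drops the accessed (A) bit for this PTE */
		ptent = pte_mkold(ptent);
		set_pte_at(mm, addr, pte, ptent);
		tlb_remove_tlb_entry(tlb, pte, addr);
	}

so each base folio already gets its A-bit cleared during this walk, and
the extra folio_referenced() pass over the reclaim list doesn't buy us
anything.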