The patch titled Subject: mm: don't be stuck to rmap lock on reclaim path has been added to the -mm mm-unstable branch. Its filename is mm-dont-be-stuck-to-rmap-lock-on-reclaim-path.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-dont-be-stuck-to-rmap-lock-on-reclaim-path.patch This patch will later appear in the mm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Minchan Kim <minchan@xxxxxxxxxx> Subject: mm: don't be stuck to rmap lock on reclaim path The rmap locks(i_mmap_rwsem and anon_vma->root->rwsem) could be contended under memory pressure if processes keep working on their vmas(e.g., fork, mmap, munmap). It makes reclaim path stuck. In our real workload traces, we see kswapd is waiting the lock for 300ms+(worst case, a sec) and it makes other processes entering direct reclaim, which were also stuck on the lock. This patch makes lru aging path try_lock mode like shink_page_list so the reclaim context will keep working with next lru pages without being stuck. if it found the rmap lock contended, it rotates the page back to head of lru in both active/inactive lrus to make them consistent behavior, which is basic starting point rather than adding more heristic. Since this patch introduces a new "contended" field as out-param along with try_lock in-param in rmap_walk_control, it's not immutable any longer if the try_lock is set so remove const keywords on rmap related functions. Since rmap walking is already expensive operation, I doubt the const would help sizable benefit( And we didn't have it until 5.17). In a heavy app workload in Android, trace shows following statistics. It almost removes rmap lock contention from reclaim path. >From Martin Liu Before: max_dur(ms) min_dur(ms) max-min(dur)ms avg_dur(ms) sum_dur(ms) count blocked_function 1632 0 1631 151.542173 31672 209 page_lock_anon_vma_read 601 0 601 145.544681 28817 198 rmap_walk_file After: max_dur(ms) min_dur(ms) max-min(dur)ms avg_dur(ms) sum_dur(ms) count blocked_function NaN NaN NaN NaN NaN 0.0 NaN 0 0 0 0.127645 1 12 rmap_walk_file Link: https://lkml.kernel.org/r/20220510215423.164547-1-minchan@xxxxxxxxxx Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx> Acked-by: Johannes Weiner <hannes@xxxxxxxxxxx> Cc: Suren Baghdasaryan <surenb@xxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: John Dias <joaodias@xxxxxxxxxx> Cc: Tim Murray <timmurray@xxxxxxxxxx> Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx> Cc: Martin Liu <liumartin@xxxxxxxxxx> Cc: Minchan Kim <minchan@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/fs.h | 5 +++ include/linux/ksm.h | 4 +-- include/linux/rmap.h | 28 +++++++++++++++------- mm/ksm.c | 10 ++++++- mm/memory-failure.c | 2 - mm/page_idle.c | 7 +---- mm/rmap.c | 52 ++++++++++++++++++++++++++++++++--------- mm/vmscan.c | 7 ++++- 8 files changed, 85 insertions(+), 30 deletions(-) --- a/include/linux/fs.h~mm-dont-be-stuck-to-rmap-lock-on-reclaim-path +++ a/include/linux/fs.h @@ -477,6 +477,11 @@ static inline void i_mmap_unlock_write(s up_write(&mapping->i_mmap_rwsem); } +static inline int i_mmap_trylock_read(struct address_space *mapping) +{ + return down_read_trylock(&mapping->i_mmap_rwsem); +} + static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); --- a/include/linux/ksm.h~mm-dont-be-stuck-to-rmap-lock-on-reclaim-path +++ a/include/linux/ksm.h @@ -51,7 +51,7 @@ static inline void ksm_exit(struct mm_st struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); #else /* !CONFIG_KSM */ @@ -79,7 +79,7 @@ static inline struct page *ksm_might_nee } static inline void rmap_walk_ksm(struct folio *folio, - const struct rmap_walk_control *rwc) + struct rmap_walk_control *rwc) { } --- a/include/linux/rmap.h~mm-dont-be-stuck-to-rmap-lock-on-reclaim-path +++ a/include/linux/rmap.h @@ -128,6 +128,11 @@ static inline void anon_vma_lock_read(st down_read(&anon_vma->root->rwsem); } +static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) +{ + return down_read_trylock(&anon_vma->root->rwsem); +} + static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) { up_read(&anon_vma->root->rwsem); @@ -366,17 +371,14 @@ int pfn_mkclean_range(unsigned long pfn, void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); -/* - * Called by memory-failure.c to kill processes. - */ -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* * rmap_walk_control: To control rmap traversing for specific needs * * arg: passed to rmap_one() and invalid_vma() + * try_lock: bail out if the rmap lock is contended + * contended: indicate the rmap traversal bailed out due to lock contention * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition * anon_lock: for getting anon_lock by optimized way rather than default @@ -384,6 +386,8 @@ int page_mapped_in_vma(struct page *page */ struct rmap_walk_control { void *arg; + bool try_lock; + bool contended; /* * Return false if page table scanning in rmap_walk should be stopped. * Otherwise, return true. @@ -391,12 +395,20 @@ struct rmap_walk_control { bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct folio *folio); - struct anon_vma *(*anon_lock)(struct folio *folio); + struct anon_vma *(*anon_lock)(struct folio *folio, + struct rmap_walk_control *rwc); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc); -void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc); +void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); + +/* + * Called by memory-failure.c to kill processes. + */ +struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, + struct rmap_walk_control *rwc); +void page_unlock_anon_vma_read(struct anon_vma *anon_vma); #else /* !CONFIG_MMU */ --- a/mm/ksm.c~mm-dont-be-stuck-to-rmap-lock-on-reclaim-path +++ a/mm/ksm.c @@ -2614,7 +2614,7 @@ struct page *ksm_might_need_to_copy(stru return new_page; } -void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; @@ -2638,7 +2638,13 @@ again: struct vm_area_struct *vma; cond_resched(); - anon_vma_lock_read(anon_vma); + if (!anon_vma_trylock_read(anon_vma)) { + if (rwc->try_lock) { + rwc->contended = true; + return; + } + anon_vma_lock_read(anon_vma); + } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; --- a/mm/memory-failure.c~mm-dont-be-stuck-to-rmap-lock-on-reclaim-path +++ a/mm/memory-failure.c @@ -485,7 +485,7 @@ static void collect_procs_anon(struct pa struct anon_vma *av; pgoff_t pgoff; - av = folio_lock_anon_vma_read(folio); + av = folio_lock_anon_vma_read(folio, NULL); if (av == NULL) /* Not actually mapped anymore */ return; --- a/mm/page_idle.c~mm-dont-be-stuck-to-rmap-lock-on-reclaim-path +++ a/mm/page_idle.c @@ -86,11 +86,8 @@ static bool page_idle_clear_pte_refs_one static void page_idle_clear_pte_refs(struct page *page) { struct folio *folio = page_folio(page); - /* - * Since rwc.arg is unused, rwc is effectively immutable, so we - * can make it static const to save some cycles and stack. - */ - static const struct rmap_walk_control rwc = { + + static struct rmap_walk_control rwc = { .rmap_one = page_idle_clear_pte_refs_one, .anon_lock = folio_lock_anon_vma_read, }; --- a/mm/rmap.c~mm-dont-be-stuck-to-rmap-lock-on-reclaim-path +++ a/mm/rmap.c @@ -527,9 +527,11 @@ out: * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a - * reference like with page_get_anon_vma() and then block on the mutex. + * reference like with page_get_anon_vma() and then block on the mutex + * on !rwc->try_lock case. */ -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio) +struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -557,6 +559,12 @@ struct anon_vma *folio_lock_anon_vma_rea goto out; } + if (rwc && rwc->try_lock) { + anon_vma = NULL; + rwc->contended = true; + goto out; + } + /* trylock failed, we got to sleep */ if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; @@ -883,7 +891,8 @@ static bool invalid_folio_referenced_vma * * Quick test_and_clear_referenced for all mappings of a folio, * - * Return: The number of mappings which referenced the folio. + * Return: The number of mappings which referenced the folio. Return -1 if + * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags) @@ -897,6 +906,7 @@ int folio_referenced(struct folio *folio .rmap_one = folio_referenced_one, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, + .try_lock = true, }; *vm_flags = 0; @@ -927,7 +937,7 @@ int folio_referenced(struct folio *folio if (we_locked) folio_unlock(folio); - return pra.referenced; + return rwc.contended ? -1 : pra.referenced; } static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) @@ -2336,12 +2346,12 @@ void __put_anon_vma(struct anon_vma *ano } static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, - const struct rmap_walk_control *rwc) + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; if (rwc->anon_lock) - return rwc->anon_lock(folio); + return rwc->anon_lock(folio, rwc); /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() @@ -2353,7 +2363,17 @@ static struct anon_vma *rmap_walk_anon_l if (!anon_vma) return NULL; + if (anon_vma_trylock_read(anon_vma)) + goto out; + + if (rwc->try_lock) { + anon_vma = NULL; + rwc->contended = true; + goto out; + } + anon_vma_lock_read(anon_vma); +out: return anon_vma; } @@ -2367,7 +2387,7 @@ static struct anon_vma *rmap_walk_anon_l * contained in the anon_vma struct it points to. */ static void rmap_walk_anon(struct folio *folio, - const struct rmap_walk_control *rwc, bool locked) + struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; @@ -2415,7 +2435,7 @@ static void rmap_walk_anon(struct folio * contained in the address_space struct it points to. */ static void rmap_walk_file(struct folio *folio, - const struct rmap_walk_control *rwc, bool locked) + struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = folio_mapping(folio); pgoff_t pgoff_start, pgoff_end; @@ -2434,8 +2454,18 @@ static void rmap_walk_file(struct folio pgoff_start = folio_pgoff(folio); pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; - if (!locked) + if (!locked) { + if (i_mmap_trylock_read(mapping)) + goto lookup; + + if (rwc->try_lock) { + rwc->contended = true; + return; + } + i_mmap_lock_read(mapping); + } +lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { unsigned long address = vma_address(&folio->page, vma); @@ -2457,7 +2487,7 @@ done: i_mmap_unlock_read(mapping); } -void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc) +void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); @@ -2468,7 +2498,7 @@ void rmap_walk(struct folio *folio, cons } /* Like rmap_walk, but caller holds relevant rmap lock */ -void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc) +void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); --- a/mm/vmscan.c~mm-dont-be-stuck-to-rmap-lock-on-reclaim-path +++ a/mm/vmscan.c @@ -1391,6 +1391,10 @@ static enum page_references folio_check_ if (vm_flags & VM_LOCKED) return PAGEREF_ACTIVATE; + /* rmap lock contention: rotate */ + if (referenced_ptes == -1) + return PAGEREF_KEEP; + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -2499,8 +2503,9 @@ static void shrink_active_list(unsigned } } + /* Referenced or rmap lock contention: rotate */ if (folio_referenced(folio, 0, sc->target_mem_cgroup, - &vm_flags)) { + &vm_flags) != 0) { /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So _ Patches currently in -mm which might be from minchan@xxxxxxxxxx are mm-fix-is_pinnable_page-against-on-cma-page.patch mm-dont-be-stuck-to-rmap-lock-on-reclaim-path.patch