With rcu_read_lock(), the release of the PTE page table can be deferred
until after the RCU grace period, since the page table is now freed via
call_rcu(). So we no longer need to hold the pmd lock when we do
pte_try_get()/pte_alloc_get(), which improves performance and simplifies
the code.

Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
 fs/proc/task_mmu.c      |  8 ++++----
 include/linux/pte_ref.h | 28 +++++++++++++++-------------
 mm/gup.c                |  2 +-
 mm/hmm.c                |  2 +-
 mm/khugepaged.c         |  4 ++--
 mm/ksm.c                |  2 +-
 mm/madvise.c            |  6 +++---
 mm/memcontrol.c         |  4 ++--
 mm/memory.c             | 14 +++++++-------
 mm/mempolicy.c          |  2 +-
 mm/migrate.c            |  2 +-
 mm/mincore.c            |  2 +-
 mm/mprotect.c           |  2 +-
 mm/page_vma_mapped.c    |  2 +-
 mm/pagewalk.c           |  2 +-
 mm/pte_ref.c            | 10 +++++++++-
 mm/swapfile.c           |  2 +-
 17 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b3cf4b8a91d6..f3c9c984bc29 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -584,7 +584,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	}
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
-	    !pte_try_get(vma->vm_mm, pmd))
+	    !pte_try_get(pmd))
 		goto out;
 	/*
 	 * The mmap_lock held all the way back in m_start() is what
@@ -1148,7 +1148,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	}
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
-	    !pte_try_get(vma->vm_mm, pmd))
+	    !pte_try_get(pmd))
 		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -1482,7 +1482,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	return 0;
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-	if (!pte_try_get(walk->mm, pmdp))
+	if (!pte_try_get(pmdp))
 		return 0;
 
 	/*
@@ -1824,7 +1824,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 	if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
 		return 0;
 #endif
-	if (!pte_try_get(walk->mm, pmd))
+	if (!pte_try_get(pmd))
 		return 0;
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 	do {
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index 695fbe8b991b..f4d20faab684 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -74,16 +74,17 @@ static inline bool pte_get_unless_zero(pmd_t *pmdp)
  * i_mmap_lock or when parallel threads are excluded by other means
  * which can make @pmdp entry stable.
  */
-static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pte_try_get(pmd_t *pmdp)
 {
 	bool retval = true;
-	spinlock_t *ptl;
+	pmd_t pmdval;
 
-	ptl = pmd_lock(mm, pmdp);
-	if (pmd_leaf(*pmdp) || !pmd_present(*pmdp) ||
-	    !pte_get_unless_zero(pmdp))
+	rcu_read_lock();
+	pmdval = READ_ONCE(*pmdp);
+	if (pmd_leaf(pmdval) || !pmd_present(pmdval) ||
+	    !pte_get_unless_zero(&pmdval))
 		retval = false;
-	spin_unlock(ptl);
+	rcu_read_unlock();
 
 	return retval;
 }
@@ -129,21 +130,22 @@ static inline void pte_put_vmf(struct vm_fault *vmf)
 
 static inline int pte_alloc_try_get(struct mm_struct *mm, pmd_t *pmdp)
 {
-	if (!pte_try_get(mm, pmdp))
+	if (!pte_try_get(pmdp))
 		return __pte_alloc_try_get(mm, pmdp);
 	return 1;
 }
 
 static inline int pte_alloc_get(struct mm_struct *mm, pmd_t *pmdp)
 {
-	spinlock_t *ptl;
+	pmd_t pmdval;
 
-	ptl = pmd_lock(mm, pmdp);
-	if (pmd_none(*pmdp) || !pte_get_unless_zero(pmdp)) {
-		spin_unlock(ptl);
+	rcu_read_lock();
+	pmdval = READ_ONCE(*pmdp);
+	if (pmd_none(pmdval) || !pte_get_unless_zero(&pmdval)) {
+		rcu_read_unlock();
 		return __pte_alloc_get(mm, pmdp);
 	}
-	spin_unlock(ptl);
+	rcu_read_unlock();
 	return 0;
 }
 #else
@@ -173,7 +175,7 @@ static inline bool pte_get_unless_zero(pmd_t *pmdp)
 	return true;
 }
 
-static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pte_try_get(pmd_t *pmdp)
 {
 	return true;
 }
diff --git a/mm/gup.c b/mm/gup.c
index 3e2a153cb18e..a5be18e349cd 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -503,7 +503,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	if (unlikely(pmd_bad(*pmd)))
 		return no_page_table(vma, flags);
 
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		return no_page_table(vma, flags);
 
 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/hmm.c b/mm/hmm.c
index 29bb379510cc..d0e767c5fbb6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -380,7 +380,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
 	}
 
-	if (!pte_try_get(walk->mm, pmdp))
+	if (!pte_try_get(pmdp))
 		goto again;
 
 	ptep = pte_offset_map(pmdp, addr);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e6c4d1b7a12a..c653edd75345 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1240,7 +1240,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		goto out;
 	}
 
-	if (!pte_try_get(mm, pmd)) {
+	if (!pte_try_get(pmd)) {
 		result = SCAN_PMD_NULL;
 		goto out;
 	}
@@ -1469,7 +1469,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!pmd)
 		goto drop_hpage;
 
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		goto drop_hpage;
 
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
diff --git a/mm/ksm.c b/mm/ksm.c
index 2e106f58dad0..5671683890c0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1133,7 +1133,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	if (!pmd)
 		goto out;
 
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		goto out;
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
diff --git a/mm/madvise.c b/mm/madvise.c
index 4c4b35292212..0e849bbf268b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -193,7 +193,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
 	     pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
-	    !pte_try_get(vma->vm_mm, pmd))
+	    !pte_try_get(pmd))
 		return 0;
 
 	for (index = start; index != end; index += PAGE_SIZE) {
@@ -396,7 +396,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
 		return 0;
 #endif
-	if (!pte_try_get(vma->vm_mm, pmd))
+	if (!pte_try_get(pmd))
 		return 0;
 	tlb_change_page_size(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -596,7 +596,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		goto next;
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
-	     pmd_trans_unstable(pmd)) || !pte_try_get(mm, pmd))
+	     pmd_trans_unstable(pmd)) || !pte_try_get(pmd))
 		return 0;
 
 	nr_put++;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4f19e5f2cd18..f8c1cabdd259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5835,7 +5835,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	}
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
-	    !pte_try_get(vma->vm_mm, pmd))
+	    !pte_try_get(pmd))
 		return 0;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -6058,7 +6058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
 		return 0;
 retry:
-	if (!pte_try_get(vma->vm_mm, pmd))
+	if (!pte_try_get(pmd))
 		return 0;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory.c b/mm/memory.c
index 242ed135bde4..c8ee0074c730 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1143,7 +1143,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
-		if (!pte_try_get(src_mm, src_pmd))
+		if (!pte_try_get(src_pmd))
 			goto retry;
 		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
 				   addr, next)) {
@@ -1481,7 +1481,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 		 */
 		if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
 		     pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
-		    !pte_try_get(tlb->mm, pmd))
+		    !pte_try_get(pmd))
 			goto next;
 
 		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
@@ -2608,7 +2608,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 				continue;
 			pmd_clear_bad(pmd);
 		}
-		if (!create && !pte_try_get(mm, pmd))
+		if (!create && !pte_try_get(pmd))
 			goto retry;
 		err = apply_to_pte_range(mm, pmd, addr, next,
 					 fn, data, create, mask);
@@ -4078,7 +4078,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 		}
 	} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
 		/* See comment in handle_pte_fault() */
 		return 0;
-	} else if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
+	} else if (!pte_try_get(vmf->pmd)) {
 		goto retry;
 	}
@@ -4319,7 +4319,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 			ret = VM_FAULT_SIGBUS;
 			goto out;
 		} else {
-			if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
+			if (!pte_try_get(vmf->pmd)) {
 				ret = VM_FAULT_SIGBUS;
 				goto out;
 			}
@@ -4579,7 +4579,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		if (pmd_devmap_trans_unstable(vmf->pmd))
 			return 0;
 
-		if (!pte_try_get(vmf->vma->vm_mm, vmf->pmd))
+		if (!pte_try_get(vmf->pmd))
 			goto retry;
 
 		if (IS_ENABLED(CONFIG_FREE_USER_PTE))
@@ -5000,7 +5000,7 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
 					(address & PAGE_MASK) + PAGE_SIZE);
 		mmu_notifier_invalidate_range_start(range);
 	}
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		goto out;
 	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
 	if (!pte_present(*ptep))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cbb3640717ff..b19243d8fe56 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -520,7 +520,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 	/* THP was split, fall through to pte walk */
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
-	    !pte_try_get(walk->mm, pmd))
+	    !pte_try_get(pmd))
 		return 0;
 
 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a94e8558b2c..e1a2169ab9e9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2265,7 +2265,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 	if (unlikely(pmd_bad(*pmdp)))
 		return migrate_vma_collect_skip(start, end, walk);
 
-	if (!pte_try_get(mm, pmdp))
+	if (!pte_try_get(pmdp))
 		goto again;
 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
diff --git a/mm/mincore.c b/mm/mincore.c
index e21e271a7657..76eb31aaeef9 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -115,7 +115,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	}
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
-	    !pte_try_get(walk->mm, pmd)) {
+	    !pte_try_get(pmd)) {
 		__mincore_unmapped_range(addr, end, vma, vec);
 		goto out;
 	}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 9cbd0848c5c5..8b387f8386c4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -319,7 +319,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 			}
 			/* fall through, the trans huge pmd just split */
 		}
-		if (!pte_try_get(vma->vm_mm, pmd))
+		if (!pte_try_get(pmd))
 			goto retry;
 		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
 					      cp_flags);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index eb84fa5825c0..c49bbff7aa60 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -259,7 +259,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 			step_forward(pvmw, PMD_SIZE);
 			continue;
 		}
-		if (!pte_try_get(pvmw->vma->vm_mm, pvmw->pmd))
+		if (!pte_try_get(pvmw->pmd))
 			goto retry;
 		if (!map_pte(pvmw))
 			goto next_pte;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 4080a88d7852..c7439a2e85f7 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -152,7 +152,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
 		} else {
 			if (!walk->no_vma) {
-				if (!pte_try_get(walk->mm, pmd))
+				if (!pte_try_get(pmd))
 					goto again;
 				err = walk_pte_range(pmd, addr, next, walk);
 				pte_put(walk->mm, pmd, addr);
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index 1b8d9828d513..7fd3d687a9cd 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -26,6 +26,14 @@ static inline void pte_free_debug(pmd_t pmd)
 }
 #endif
 
+static void pte_free_rcu(struct rcu_head *rcu)
+{
+	struct page *page = container_of(rcu, struct page, rcu_head);
+
+	pgtable_pte_page_dtor(page);
+	__free_page(page);
+}
+
 void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
 {
 	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
@@ -39,7 +47,7 @@ void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
 	pte_free_debug(pmd);
 	flush_tlb_range(&vma, addr, addr + PMD_SIZE);
 	mm_dec_nr_ptes(mm);
-	pte_free(mm, pmd_pgtable(pmd));
+	call_rcu(&pmd_pgtable(pmd)->rcu_head, pte_free_rcu);
 }
 
 static inline void __pte_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6153283be500..fe6f7c6d2849 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2024,7 +2024,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
 		     pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
-		    !pte_try_get(vma->vm_mm, pmd))
+		    !pte_try_get(pmd))
 			continue;
 		ret = unuse_pte_range(vma, pmd, addr, next, type,
 				      frontswap, fs_pages_to_unuse);
-- 
2.11.0
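
For readers following the locking change: the reason pte_try_get() can now run
under rcu_read_lock() alone is that the PTE page table is freed through
call_rcu() (see free_pte_table() above), so a lockless walker either succeeds
in taking a reference while the count is still non-zero, or sees the table as
already dying and backs off. Below is a minimal, self-contained user-space
sketch of that "get unless zero" pattern. It is only an illustration: the
names table_try_get(), table_put() and struct pte_table are made-up stand-ins,
not the kernel API, and a plain free() stands in for the call_rcu()-deferred
free.

	/*
	 * Simplified user-space analogue of the pte_ref "get unless zero"
	 * pattern. In the kernel, pte_try_get() runs under rcu_read_lock()
	 * instead of the pmd lock, and the final put frees the PTE page via
	 * call_rcu() after a grace period.
	 */
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct pte_table {
		atomic_int refcount;	/* stands in for the per-PTE-page refcount */
		/* ... page table payload would live here ... */
	};

	/* Take a reference only if the count has not already dropped to zero. */
	static bool table_try_get(struct pte_table *t)
	{
		int old = atomic_load(&t->refcount);

		while (old != 0) {
			if (atomic_compare_exchange_weak(&t->refcount, &old, old + 1))
				return true;	/* reference taken, table is pinned */
		}
		return false;			/* table is already on its way to being freed */
	}

	/*
	 * Drop a reference. The last put is the point where the kernel code
	 * would call_rcu() the page instead of freeing it immediately.
	 */
	static void table_put(struct pte_table *t)
	{
		if (atomic_fetch_sub(&t->refcount, 1) == 1) {
			printf("last reference dropped, freeing table\n");
			free(t);
		}
	}

	int main(void)
	{
		struct pte_table *t = malloc(sizeof(*t));

		if (!t)
			return 1;
		atomic_init(&t->refcount, 1);	/* initial reference held by the pmd entry */

		if (table_try_get(t)) {		/* a lockless walker pins the table */
			/* ... walk the page table ... */
			table_put(t);		/* walker is done */
		}

		table_put(t);			/* pmd entry cleared: final put frees the table */
		return 0;
	}

As the commit message describes, the only guarantee pte_try_get() has to
provide is that it never resurrects a table whose count already reached zero;
RCU then keeps the memory itself valid for any reader still inside its
read-side critical section, which is why the pmd lock is no longer required.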