Free PTE page table pages via call_rcu() so that their release is
deferred until all current RCU readers have finished. With that,
pte_try_get()/pte_alloc_get() no longer need to take the pmd lock:
holding rcu_read_lock() across the pmd read and the refcount grab is
enough to keep the PTE page table from being freed underneath us. This
improves performance and simplifies the code.

Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
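Not part of the patch: a minimal sketch of the RCU pattern this change
relies on, using an illustrative struct foo in place of the PTE page
table page. foo_try_get()/foo_put()/foo_free_rcu() are hypothetical
names, not the pte_ref API; rcu_read_lock()/rcu_read_unlock(),
call_rcu() and the atomic_* helpers are the real kernel primitives.

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	atomic_t refcount;
	struct rcu_head rcu;
};

/*
 * RCU callback: runs only after all readers that entered their
 * read-side critical section before the call_rcu() have left it.
 */
static void foo_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct foo, rcu));
}

/* Writer side: drop the last reference, defer the actual free to RCU. */
static void foo_put(struct foo *p)
{
	if (atomic_dec_and_test(&p->refcount))
		call_rcu(&p->rcu, foo_free_rcu);
}

/*
 * Reader side: no spinlock needed. Inside the RCU read-side critical
 * section the object cannot be freed, so we either bump the refcount
 * from a non-zero value or fail cleanly.
 */
static bool foo_try_get(struct foo *p)
{
	bool ret;

	rcu_read_lock();
	ret = atomic_inc_not_zero(&p->refcount);
	rcu_read_unlock();

	return ret;
}

This is the same reasoning that lets pte_try_get() below replace
pmd_lock() with rcu_read_lock() once free_pte_table() hands the page
to call_rcu().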
 fs/proc/task_mmu.c      |  8 ++++----
 include/linux/pte_ref.h | 34 ++++++++++++++++++----------------
 mm/gup.c                |  2 +-
 mm/hmm.c                |  2 +-
 mm/khugepaged.c         |  4 ++--
 mm/ksm.c                |  2 +-
 mm/madvise.c            |  6 +++---
 mm/memcontrol.c         |  4 ++--
 mm/memory.c             | 14 +++++++-------
 mm/mempolicy.c          |  2 +-
 mm/migrate.c            |  2 +-
 mm/mincore.c            |  2 +-
 mm/mprotect.c           |  2 +-
 mm/page_vma_mapped.c    |  2 +-
 mm/pagewalk.c           |  2 +-
 mm/pte_ref.c            | 10 +++++++++-
 mm/swapfile.c           |  2 +-
 17 files changed, 55 insertions(+), 45 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b31915696210..f44caef03f22 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -583,7 +583,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		goto out;
 	}
 
-	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(pmd))
 		goto out;
 	/*
 	 * The mmap_lock held all the way back in m_start() is what
@@ -1146,7 +1146,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		return 0;
 	}
 
-	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(pmd))
 		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -1477,7 +1477,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-	if (pmd_trans_unstable_or_pte_try_get(walk->mm, pmdp))
+	if (pmd_trans_unstable_or_pte_try_get(pmdp))
 		return 0;
 
 	/*
@@ -1816,7 +1816,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		return 0;
 	}
 #endif
-	if (pmd_trans_unstable_or_pte_try_get(walk->mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(pmd))
 		return 0;
 
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index 04cca9427270..259e5aec048d 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -90,16 +90,17 @@ static inline bool pte_get_unless_zero(pmd_t *pmdp)
  * Before Operating the PTE page table, we need to hold a ->pte_refcount
  * to protect against the concurrent release of the PTE page table.
  */
-static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pte_try_get(pmd_t *pmdp)
 {
 	bool retval = true;
-	spinlock_t *ptl;
+	pmd_t pmdval;
 
-	ptl = pmd_lock(mm, pmdp);
-	if (pmd_leaf(*pmdp) || !pmd_present(*pmdp) ||
-	    !pte_get_unless_zero(pmdp))
+	rcu_read_lock();
+	pmdval = READ_ONCE(*pmdp);
+	if (pmd_leaf(pmdval) || !pmd_present(pmdval) ||
+	    !pte_get_unless_zero(&pmdval))
 		retval = false;
-	spin_unlock(ptl);
+	rcu_read_unlock();
 
 	return retval;
 }
@@ -159,14 +160,15 @@ static inline void pte_put_vmf(struct vm_fault *vmf)
  */
 static inline int pte_alloc_get(struct mm_struct *mm, pmd_t *pmdp)
 {
-	spinlock_t *ptl;
+	pmd_t pmdval;
 
-	ptl = pmd_lock(mm, pmdp);
-	if (pmd_none(*pmdp) || !pte_get_unless_zero(pmdp)) {
-		spin_unlock(ptl);
+	rcu_read_lock();
+	pmdval = READ_ONCE(*pmdp);
+	if (pmd_none(pmdval) || !pte_get_unless_zero(&pmdval)) {
+		rcu_read_unlock();
 		return __pte_alloc_get(mm, pmdp);
 	}
-	spin_unlock(ptl);
+	rcu_read_unlock();
 
 	return 0;
 }
@@ -189,14 +191,14 @@ static inline int pte_alloc_get(struct mm_struct *mm, pmd_t *pmdp)
  */
 static inline int pte_alloc_try_get(struct mm_struct *mm, pmd_t *pmdp)
 {
-	if (!pte_try_get(mm, pmdp))
+	if (!pte_try_get(pmdp))
 		return __pte_alloc_try_get(mm, pmdp);
 	return 1;
 }
 
-static inline bool pmd_trans_unstable_or_pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pmd_trans_unstable_or_pte_try_get(pmd_t *pmdp)
 {
-	if (!pte_try_get(mm, pmdp))
+	if (!pte_try_get(pmdp))
 		return true;
 	return false;
 }
@@ -227,7 +229,7 @@ static inline bool pte_get_unless_zero(pmd_t *pmdp)
 	return true;
 }
 
-static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pte_try_get(pmd_t *pmdp)
 {
 	return true;
 }
@@ -265,7 +267,7 @@ static inline int pte_alloc_try_get(struct mm_struct *mm, pmd_t *pmdp)
 	return 1;
 }
 
-static inline bool pmd_trans_unstable_or_pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pmd_trans_unstable_or_pte_try_get(pmd_t *pmdp)
 {
 	if (pmd_trans_unstable(pmdp))
 		return true;
diff --git a/mm/gup.c b/mm/gup.c
index 30757f3b176c..c987ac45d939 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -500,7 +500,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	if (unlikely(pmd_bad(*pmd)))
 		return no_page_table(vma, flags);
 
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		return no_page_table(vma, flags);
 
 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/hmm.c b/mm/hmm.c
index 29bb379510cc..d0e767c5fbb6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -380,7 +380,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
 	}
 
-	if (!pte_try_get(walk->mm, pmdp))
+	if (!pte_try_get(pmdp))
 		goto again;
 
 	ptep = pte_offset_map(pmdp, addr);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 95d90c896580..f33db38eaafc 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1240,7 +1240,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	}
 
 	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
-	if (!pte_try_get(mm, pmd)) {
+	if (!pte_try_get(pmd)) {
 		result = SCAN_PMD_NULL;
 		goto out;
 	}
@@ -1468,7 +1468,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!pmd)
 		goto drop_hpage;
 
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		goto drop_hpage;
 
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
diff --git a/mm/ksm.c b/mm/ksm.c
index d0d72dd1eaf0..4a15418f1252 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1138,7 +1138,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	if (!pmd)
 		goto out;
 
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		goto out;
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
diff --git a/mm/madvise.c b/mm/madvise.c
index 1befb4e64f2b..254811f41850 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -191,7 +191,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	struct vm_area_struct *vma = walk->private;
 	unsigned long index;
 
-	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(pmd))
 		return 0;
 
 	for (index = start; index != end; index += PAGE_SIZE) {
@@ -392,7 +392,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 
 regular_page:
 #endif
-	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(pmd))
 		return 0;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
@@ -592,7 +592,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
 			goto next;
 
-	if (pmd_trans_unstable_or_pte_try_get(mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(pmd))
 		return 0;
 
 	nr_put++;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 56c580d37e94..956920f96191 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5810,7 +5810,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 		return 0;
 	}
 
-	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(pmd))
 		return 0;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -6030,7 +6030,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 		return 0;
 	}
 retry:
-	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(pmd))
 		return 0;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory.c b/mm/memory.c
index 99dde124755b..6a7fe29d593b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1142,7 +1142,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		}
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
-		if (!pte_try_get(src_mm, src_pmd))
+		if (!pte_try_get(src_pmd))
 			goto retry;
 		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
 				   addr, next)) {
@@ -1478,7 +1478,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 		 * because MADV_DONTNEED holds the mmap_lock in read
 		 * mode.
 		 */
-		if (pmd_trans_unstable_or_pte_try_get(tlb->mm, pmd))
+		if (pmd_trans_unstable_or_pte_try_get(pmd))
 			goto next;
 		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
 		pte_put(tlb->mm, pmd, addr);
@@ -2604,7 +2604,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 				continue;
 			pmd_clear_bad(pmd);
 		}
-		if (!create && !pte_try_get(mm, pmd))
+		if (!create && !pte_try_get(pmd))
 			goto retry;
 		err = apply_to_pte_range(mm, pmd, addr, next,
 					 fn, data, create, mask);
@@ -4077,7 +4077,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
 		/* See comment in handle_pte_fault() */
 		return 0;
-	} else if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
+	} else if (!pte_try_get(vmf->pmd)) {
 		goto retry;
 	}
 
@@ -4320,7 +4320,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 			ret = VM_FAULT_SIGBUS;
 			goto out;
 		} else {
-			if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
+			if (!pte_try_get(vmf->pmd)) {
 				ret = VM_FAULT_SIGBUS;
 				goto out;
 			}
@@ -4579,7 +4579,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		if (pmd_devmap_trans_unstable(vmf->pmd))
 			return 0;
 
-		if (!pte_try_get(vmf->vma->vm_mm, vmf->pmd))
+		if (!pte_try_get(vmf->pmd))
 			goto retry;
 
 		if (IS_ENABLED(CONFIG_FREE_USER_PTE))
@@ -5000,7 +5000,7 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
 					  (address & PAGE_MASK) + PAGE_SIZE);
 			mmu_notifier_invalidate_range_start(range);
 		}
-		if (!pte_try_get(mm, pmd))
+		if (!pte_try_get(pmd))
 			goto out;
 		ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
 		if (!pte_present(*ptep))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9d0493f80a75..9a6d1c845a93 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -518,7 +518,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 	}
 	/* THP was split, fall through to pte walk */
 
-	if (pmd_trans_unstable_or_pte_try_get(walk->mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(pmd))
 		return 0;
 
 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
diff --git a/mm/migrate.c b/mm/migrate.c
index af5b8900551b..eb0da7fb7033 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2298,7 +2298,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 	if (unlikely(pmd_bad(*pmdp)))
 		return migrate_vma_collect_skip(start, end, walk);
 
-	if (!pte_try_get(mm, pmdp))
+	if (!pte_try_get(pmdp))
 		goto again;
 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
diff --git a/mm/mincore.c b/mm/mincore.c
index a72ec90dd54f..e47fe60c6e04 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -114,7 +114,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		goto out;
 	}
 
-	if (pmd_trans_unstable_or_pte_try_get(walk->mm, pmd)) {
+	if (pmd_trans_unstable_or_pte_try_get(pmd)) {
 		__mincore_unmapped_range(addr, end, vma, vec);
 		goto out;
 	}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 53b412423ee8..4673604c709e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -279,7 +279,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 			}
 			/* fall through, the trans huge pmd just split */
 		}
-		if (!pte_try_get(vma->vm_mm, pmd))
+		if (!pte_try_get(pmd))
 			goto retry;
 		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
 					      cp_flags);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index eb84fa5825c0..c49bbff7aa60 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -259,7 +259,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 			step_forward(pvmw, PMD_SIZE);
 			continue;
 		}
-		if (!pte_try_get(pvmw->vma->vm_mm, pvmw->pmd))
+		if (!pte_try_get(pvmw->pmd))
 			goto retry;
 		if (!map_pte(pvmw))
 			goto next_pte;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index da1324021429..97cd4e726a2b 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -153,7 +153,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
 	} else {
 		if (!walk->no_vma) {
-			if (!pte_try_get(walk->mm, pmd))
+			if (!pte_try_get(pmd))
 				goto retry;
 			err = walk_pte_range(pmd, addr, next, walk);
 			pte_put(walk->mm, pmd, addr);
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index dff32909c7c4..ea40b1777056 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -26,6 +26,14 @@ static inline void pte_free_debug(pmd_t pmd)
 }
 #endif
 
+static void pte_free_rcu(struct rcu_head *rcu)
+{
+	struct page *page = container_of(rcu, struct page, rcu_head);
+
+	pgtable_pte_page_dtor(page);
+	__free_page(page);
+}
+
 void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
 {
 	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
@@ -39,7 +47,7 @@ void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
 	pte_free_debug(pmd);
 	flush_tlb_range(&vma, addr, addr + PMD_SIZE);
 	mm_dec_nr_ptes(mm);
-	pte_free(mm, pmd_pgtable(pmd));
+	call_rcu(&pmd_pgtable(pmd)->rcu_head, pte_free_rcu);
 }
 
 static inline void __pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6db8381e1e19..47e95aceedd5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2021,7 +2021,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	do {
 		cond_resched();
 		next = pmd_addr_end(addr, end);
-		if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
+		if (pmd_trans_unstable_or_pte_try_get(pmd))
 			continue;
 		ret = unuse_pte_range(vma, pmd, addr, next, type,
 				      frontswap, fs_pages_to_unuse);
-- 
2.11.0