RCU makes sure the pte_t* won't go away from under us.  Please refer to
the comment above huge_pte_offset() for more information.

A small trick is used to release RCU slightly earlier, but that should be
safe and is simply cleaner (with a rich comment in the code).

Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
 mm/hugetlb.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85214095fb85..5dc87e4e6780 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6300,6 +6300,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			break;
 		}
 
+		/* For huge_pte_offset() */
+		rcu_read_lock();
+
 		/*
 		 * Some archs (sparc64, sh*) have multiple pte_ts to
 		 * each hugepage.  We have to make sure we get the
@@ -6324,6 +6327,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
 			if (pte)
 				spin_unlock(ptl);
+			rcu_read_unlock();
 			remainder = 0;
 			break;
 		}
@@ -6345,6 +6349,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 			if (pte)
 				spin_unlock(ptl);
+			rcu_read_unlock();
+
 			if (flags & FOLL_WRITE)
 				fault_flags |= FAULT_FLAG_WRITE;
 			else if (unshare)
@@ -6387,6 +6393,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			continue;
 		}
 
+		/*
+		 * When we reach here, the pteval is not absent, so
+		 * anyone who wants to free and invalidate the pgtable
+		 * page (aka, pte*) needs to first unmap the entries,
+		 * which relies on the pgtable lock.  Since we're
+		 * holding it, we're safe even without RCU.
+		 *
+		 * We could also release RCU after each pgtable unlock
+		 * below, but this is cleaner and keeps the critical
+		 * section smaller.
+		 */
+		rcu_read_unlock();
+
 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
 		page = pte_page(huge_ptep_get(pte));
-- 
2.37.3
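
For reviewers less familiar with the lifetime rule here, below is a
minimal sketch of the RCU -> pgtable-lock handoff the patch establishes.
The function name hugetlb_walk_one() and its return values are purely
illustrative (they are not part of the patch); huge_pte_offset(),
huge_pte_lock(), huge_pte_none(), huge_ptep_get() and the RCU/spinlock
primitives are the real kernel APIs, assuming <linux/hugetlb.h> and
<linux/rcupdate.h>:

	/*
	 * Sketch only; not the actual follow_hugetlb_page() body.
	 */
	static int hugetlb_walk_one(struct mm_struct *mm, struct hstate *h,
				    unsigned long vaddr)
	{
		spinlock_t *ptl = NULL;
		pte_t *pte;

		/* RCU keeps the pgtable page (hence pte) alive during lookup */
		rcu_read_lock();
		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
				      huge_page_size(h));
		if (pte)
			ptl = huge_pte_lock(h, mm, pte);

		if (!pte || huge_pte_none(huge_ptep_get(pte))) {
			/* Absent entry: the pgtable page may be freed anytime */
			if (pte)
				spin_unlock(ptl);
			rcu_read_unlock();
			return -EFAULT;
		}

		/*
		 * Entry is present and ptl is held: anyone freeing the
		 * pgtable page must unmap the entry first, which needs
		 * ptl, so RCU can be dropped early here.
		 */
		rcu_read_unlock();

		/* ... operate on *pte under ptl ... */

		spin_unlock(ptl);
		return 0;
	}

The key point is that RCU is only needed until a present entry has been
observed under ptl; from then on, the pgtable lock alone pins the
page-table page, which is why the patch can release RCU early.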