When a pmd is 'unshared' it effectively deletes part of a process's page
tables.  The routine huge_pmd_unshare must be called with i_mmap_rwsem
held in write mode and the page table locked.  However, consider a page
fault happening within that same process.  We could have the following
race:

Faulting thread                                 Unsharing thread
...                                                  ...
ptep = huge_pte_offset()
      or
ptep = huge_pte_alloc()
...
                                                i_mmap_unlock_write
                                                lock_page table
ptep invalid   <------------------------        huge_pmd_unshare
Could be in a previously                        unlock_page_table
sharing process or worse
...
ptl = huge_pte_lock(ptep)
get/update pte
set_pte_at(pte, ptep)

If the above race happens, we can update the pte of another process.

Catch this situation by doing another huge_pte_offset/page table walk
after obtaining the page table lock and comparing pointers.  If the
pointers are different, then we know a race happened and we can bail and
clean up.

In fault code, make sure to check for this race AFTER checking for faults
beyond i_size so the page cache can be cleaned up properly.

Do note that even this is not perfect.  The page table lock is in the page
struct of the pmd page.  We need the pmd pointer (ptep) to get the page
table lock.  As shown above, we cannot even be certain ptep is still valid
when getting/locking the page table.  The other option is to always use
'mm->page_table_lock' for hugetlb page tables.

Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx> --- mm/hugetlb.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b8f994961a68..e5196f0fa09c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4695,6 +4695,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pte_t *src_pte, *dst_pte, entry, dst_entry; + pte_t *src_pte2; struct page *ptepage; unsigned long addr; bool cow = is_cow_mapping(vma->vm_flags); @@ -4741,7 +4742,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); dst_entry = huge_ptep_get(dst_pte); again: - if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { + + src_pte2 = huge_pte_offset(src, addr, sz); + if (unlikely(src_pte2 != src_pte)) { + /* + * Another thread could have unshared src_pte. + * Just skip. + */ + ; + } else if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { /* * Skip if src entry none. Also, skip in the * unlikely case dst entry !none as this implies @@ -5363,6 +5372,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, bool new_page, new_pagecache_page = false; bool beyond_i_size = false; bool reserve_alloc = false; + pte_t *ptep2; /* * Currently, we are forced to kill the process in the event the @@ -5410,8 +5420,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * sure there really is no pte entry. 
*/ ptl = huge_pte_lock(h, mm, ptep); + /* ptep2 checks for racing unshare page tables */ + ptep2 = huge_pte_offset(mm, haddr, huge_page_size(h)); ret = 0; - if (huge_pte_none(huge_ptep_get(ptep))) + if (ptep2 == ptep && huge_pte_none(huge_ptep_get(ptep))) ret = vmf_error(PTR_ERR(page)); spin_unlock(ptl); goto out; @@ -5484,6 +5496,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto backout; } + /* Check for racing unshare page tables */ + ptep2 = huge_pte_offset(mm, haddr, huge_page_size(h)); + if (ptep2 != ptep) + goto backout; + ret = 0; if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; @@ -5561,7 +5578,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep, entry; + pte_t *ptep, *ptep2, entry; spinlock_t *ptl; vm_fault_t ret; u32 hash; @@ -5640,8 +5657,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptl = huge_pte_lock(h, mm, ptep); - /* Check for a racing update before calling hugetlb_cow */ - if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + /* Check for a racing update or unshare before calling hugetlb_cow */ + if (unlikely(ptep2 != ptep || !pte_same(entry, huge_ptep_get(ptep)))) goto out_ptl; /* @@ -5720,6 +5737,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct page *page; int writable; bool page_in_pagecache = false; + pte_t *ptep2; if (is_continue) { ret = -EFAULT; @@ -5834,6 +5852,11 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_unlock; } + /* Check for racing unshare page tables */ + ptep2 = huge_pte_offset(dst_mm, dst_addr, huge_page_size(h)); + if (unlikely(ptep2 != dst_pte)) + goto out_release_unlock; + ret = -EEXIST; if (!huge_pte_none(huge_ptep_get(dst_pte))) goto out_release_unlock; -- 2.35.1