This allows fork() to work with high-granularity mappings. The page table structure is copied such that partially mapped regions will remain partially mapped in the same way for the new process. A page's reference count is incremented for *each* portion of it that is mapped in the page table. For example, if a 1G page is mapped entirely at the PMD level (512 PMD entries, assuming 2M PMDs), the reference count and mapcount will each be incremented by 512. Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx> --- mm/hugetlb.c | 75 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 25 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 718572444a73..21a5116f509b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5106,7 +5106,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *src_vma) { pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; + struct hugetlb_pte src_hpte, dst_hpte; + struct page *ptepage, *hpage; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); struct hstate *h = hstate_vma(src_vma); @@ -5126,26 +5127,34 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } else { /* * For shared mappings the vma lock must be held before - * calling hugetlb_walk() in the src vma. Otherwise, the - * returned ptep could go away if part of a shared pmd and - * another thread calls huge_pmd_unshare. + * calling hugetlb_full_walk() in the src vma. Otherwise, the + * returned hpte could go away if + * - part of a shared pmd and another thread calls + *   huge_pmd_unshare, or + * - another thread collapses a high-granularity mapping.
*/ hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); - for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { + addr = src_vma->vm_start; + while (addr < src_vma->vm_end) { spinlock_t *src_ptl, *dst_ptl; - src_pte = hugetlb_walk(src_vma, addr, sz); - if (!src_pte) { - addr |= last_addr_mask; + unsigned long hpte_sz; + + if (hugetlb_full_walk(&src_hpte, src_vma, addr)) { + addr = (addr | last_addr_mask) + sz; continue; } - dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); - if (!dst_pte) { - ret = -ENOMEM; + ret = hugetlb_full_walk_alloc(&dst_hpte, dst_vma, addr, + hugetlb_pte_size(&src_hpte)); + if (ret) break; - } + + src_pte = src_hpte.ptep; + dst_pte = dst_hpte.ptep; + + hpte_sz = hugetlb_pte_size(&src_hpte); /* * If the pagetables are shared don't copy or take references. @@ -5155,13 +5164,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * another vma. So page_count of ptep page is checked instead * to reliably determine whether pte is shared. */ - if (page_count(virt_to_page(dst_pte)) > 1) { - addr |= last_addr_mask; + if (hugetlb_pte_size(&dst_hpte) == sz && + page_count(virt_to_page(dst_pte)) > 1) { + addr = (addr | last_addr_mask) + sz; continue; } - dst_ptl = huge_pte_lock(h, dst, dst_pte); - src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte); + dst_ptl = hugetlb_pte_lock(&dst_hpte); + src_ptl = hugetlb_pte_lockptr(&src_hpte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); again: @@ -5205,10 +5215,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, */ if (userfaultfd_wp(dst_vma)) set_huge_pte_at(dst, addr, dst_pte, entry); + } else if (!hugetlb_pte_present_leaf(&src_hpte, entry)) { + /* Retry the walk. 
*/ + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + continue; } else { - entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); - get_page(ptepage); + hpage = compound_head(ptepage); + get_page(hpage); /* * Failing to duplicate the anon rmap is a rare case @@ -5220,25 +5235,31 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * need to be without the pgtable locks since we could * sleep during the process. */ - if (!PageAnon(ptepage)) { - page_dup_file_rmap(ptepage, true); - } else if (page_try_dup_anon_rmap(ptepage, true, + if (!PageAnon(hpage)) { + page_dup_file_rmap(hpage, true); + } else if (page_try_dup_anon_rmap(hpage, true, src_vma)) { pte_t src_pte_old = entry; struct page *new; + if (hugetlb_pte_size(&src_hpte) != sz) { + put_page(hpage); + ret = -EINVAL; + break; + } + spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ new = alloc_huge_page(dst_vma, addr, 1); if (IS_ERR(new)) { - put_page(ptepage); + put_page(hpage); ret = PTR_ERR(new); break; } - copy_user_huge_page(new, ptepage, addr, dst_vma, + copy_user_huge_page(new, hpage, addr, dst_vma, npages); - put_page(ptepage); + put_page(hpage); /* Install the new huge page if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); @@ -5256,6 +5277,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, hugetlb_install_page(dst_vma, dst_pte, addr, new); spin_unlock(src_ptl); spin_unlock(dst_ptl); + addr += hugetlb_pte_size(&src_hpte); continue; } @@ -5272,10 +5294,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } set_huge_pte_at(dst, addr, dst_pte, entry); - hugetlb_count_add(npages, dst); + hugetlb_count_add( + hugetlb_pte_size(&dst_hpte) / PAGE_SIZE, + dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); + addr += hugetlb_pte_size(&src_hpte); } if (cow) { -- 2.39.0.314.g84b9a713c41-goog