This allows fork() to work with high-granularity mappings. The page
table structure is copied such that partially mapped regions will remain
partially mapped in the same way for the new process.

A page's reference count is incremented for *each* portion of it that is
mapped in the page table. For example, if you have a PMD-mapped 1G page,
the reference count and mapcount will be incremented by 512.

Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
---
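(Not part of the patch itself: a minimal userspace sketch of the refcounting
arithmetic described above, assuming x86-style 1G/2M/4K sizes. refs_taken()
and the size constants are made up purely for illustration; they are not
kernel or series APIs.)

	#include <stdio.h>

	/*
	 * Each page-table leaf that maps a piece of the huge page takes one
	 * reference on the compound head, so fork() ends up taking
	 * hpage_size / mapping_size references in total.
	 */
	static unsigned long refs_taken(unsigned long hpage_size,
					unsigned long mapping_size)
	{
		return hpage_size / mapping_size;
	}

	int main(void)
	{
		unsigned long sz_1g = 1UL << 30;
		unsigned long sz_2m = 2UL << 20;
		unsigned long sz_4k = 4UL << 10;

		/* PUD-mapped 1G page: 1 leaf, 1 reference. */
		printf("PUD-mapped 1G: %lu\n", refs_taken(sz_1g, sz_1g));
		/* PMD-mapped 1G page: 512 leaves, 512 references. */
		printf("PMD-mapped 1G: %lu\n", refs_taken(sz_1g, sz_2m));
		/* PTE-mapped 1G page: 262144 leaves/references. */
		printf("PTE-mapped 1G: %lu\n", refs_taken(sz_1g, sz_4k));
		return 0;
	}

The copy loop below follows the same idea: each iteration copies one leaf at
whatever level the walk found, advances addr by hugetlb_pte_size(&src_hpte),
and accounts hugetlb_pte_size(&dst_hpte) / PAGE_SIZE pages via
hugetlb_count_add().
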
 mm/hugetlb.c | 81 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 65 insertions(+), 16 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5783a8307a77..7d692907cbf3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4946,7 +4946,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *src_vma)
 {
 	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
+	struct hugetlb_pte src_hpte, dst_hpte;
+	struct page *ptepage, *hpage;
 	unsigned long addr;
 	bool cow = is_cow_mapping(src_vma->vm_flags);
 	struct hstate *h = hstate_vma(src_vma);
@@ -4956,6 +4957,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long last_addr_mask;
 	int ret = 0;
 
+	if (hugetlb_hgm_enabled(src_vma)) {
+		/*
+		 * src_vma might have high-granularity PTEs, and dst_vma will
+		 * need to copy those.
+		 */
+		ret = enable_hugetlb_hgm(dst_vma);
+		if (ret)
+			return ret;
+	}
+
 	if (cow) {
 		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma,
 					src, src_vma->vm_start,
@@ -4967,18 +4978,22 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		/*
 		 * For shared mappings the vma lock must be held before
 		 * calling huge_pte_offset in the src vma. Otherwise, the
-		 * returned ptep could go away if part of a shared pmd and
-		 * another thread calls huge_pmd_unshare.
+		 * returned ptep could go away if
+		 *  - part of a shared pmd and another thread calls
+		 *    huge_pmd_unshare, or
+		 *  - another thread collapses a high-granularity mapping.
 		 */
 		hugetlb_vma_lock_read(src_vma);
 	}
 
 	last_addr_mask = hugetlb_mask_last_page(h);
-	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
+	addr = src_vma->vm_start;
+	while (addr < src_vma->vm_end) {
 		spinlock_t *src_ptl, *dst_ptl;
+		unsigned long hpte_sz;
 		src_pte = huge_pte_offset(src, addr, sz);
 		if (!src_pte) {
-			addr |= last_addr_mask;
+			addr = (addr | last_addr_mask) + sz;
 			continue;
 		}
 		dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
@@ -4987,6 +5002,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			break;
 		}
 
+		hugetlb_pte_populate(&src_hpte, src_pte, huge_page_shift(h),
+				hpage_size_to_level(huge_page_size(h)));
+		hugetlb_pte_populate(&dst_hpte, dst_pte, huge_page_shift(h),
+				hpage_size_to_level(huge_page_size(h)));
+
+		if (hugetlb_hgm_enabled(src_vma)) {
+			hugetlb_hgm_walk(src, src_vma, &src_hpte, addr,
+					PAGE_SIZE, /*stop_at_none=*/true);
+			ret = hugetlb_hgm_walk(dst, dst_vma, &dst_hpte, addr,
+					hugetlb_pte_size(&src_hpte),
+					/*stop_at_none=*/false);
+			if (ret)
+				break;
+
+			src_pte = src_hpte.ptep;
+			dst_pte = dst_hpte.ptep;
+		}
+
+		hpte_sz = hugetlb_pte_size(&src_hpte);
+
 		/*
 		 * If the pagetables are shared don't copy or take references.
 		 *
@@ -4996,12 +5031,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		 * to reliably determine whether pte is shared.
 		 */
 		if (page_count(virt_to_page(dst_pte)) > 1) {
-			addr |= last_addr_mask;
+			addr = (addr | last_addr_mask) + sz;
 			continue;
 		}
 
-		dst_ptl = huge_pte_lock(h, dst, dst_pte);
-		src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte);
+		dst_ptl = hugetlb_pte_lock(dst, &dst_hpte);
+		src_ptl = hugetlb_pte_lockptr(src, &src_hpte);
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 		entry = huge_ptep_get(src_pte);
 again:
@@ -5042,10 +5077,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			 */
 			if (userfaultfd_wp(dst_vma))
 				set_huge_pte_at(dst, addr, dst_pte, entry);
+		} else if (!hugetlb_pte_present_leaf(&src_hpte, entry)) {
+			/* Retry the walk. */
+			spin_unlock(src_ptl);
+			spin_unlock(dst_ptl);
+			continue;
 		} else {
-			entry = huge_ptep_get(src_pte);
 			ptepage = pte_page(entry);
-			get_page(ptepage);
+			hpage = compound_head(ptepage);
+			get_page(hpage);
 
 			/*
 			 * Failing to duplicate the anon rmap is a rare case
@@ -5058,24 +5098,29 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			 * sleep during the process.
 			 */
 			if (!PageAnon(ptepage)) {
-				page_dup_file_rmap(ptepage, true);
-			} else if (page_try_dup_anon_rmap(ptepage, true,
+				page_dup_file_rmap(hpage, true);
+			} else if (page_try_dup_anon_rmap(hpage, true,
							  src_vma)) {
				pte_t src_pte_old = entry;
				struct page *new;
 
+				if (hugetlb_hgm_enabled(src_vma)) {
+					ret = -EINVAL;
+					break;
+				}
+
				spin_unlock(src_ptl);
				spin_unlock(dst_ptl);
				/* Do not use reserve as it's private owned */
				new = alloc_huge_page(dst_vma, addr, 1);
				if (IS_ERR(new)) {
-					put_page(ptepage);
+					put_page(hpage);
					ret = PTR_ERR(new);
					break;
				}
-				copy_user_huge_page(new, ptepage, addr, dst_vma,
+				copy_user_huge_page(new, hpage, addr, dst_vma,
						    npages);
-				put_page(ptepage);
+				put_page(hpage);
 
				/* Install the new huge page if src pte stable */
				dst_ptl = huge_pte_lock(h, dst, dst_pte);
@@ -5093,6 +5138,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
				hugetlb_install_page(dst_vma, dst_pte, addr, new);
				spin_unlock(src_ptl);
				spin_unlock(dst_ptl);
+				addr += hugetlb_pte_size(&src_hpte);
				continue;
			}
 
@@ -5109,10 +5155,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			}
 
			set_huge_pte_at(dst, addr, dst_pte, entry);
-			hugetlb_count_add(npages, dst);
+			hugetlb_count_add(
+					hugetlb_pte_size(&dst_hpte) / PAGE_SIZE,
+					dst);
		}
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
+		addr += hugetlb_pte_size(&src_hpte);
	}
 
	if (cow) {
-- 
2.38.0.135.g90850a2211-goog