Update the page fault handler to support high-granularity page faults. While handling a page fault on a partially-mapped HugeTLB page, if the PTE we find with hugetlb_hgm_walk is none, then we will replace it with a leaf-level PTE to map the page. To give some examples: 1. For a completely unmapped 1G page, it will be mapped with a 1G PUD. 2. For a 1G page that has its first 512M mapped, any faults on the unmapped sections will result in 2M PMDs mapping each unmapped 2M section. 3. For a 1G page that has only its first 4K mapped, a page fault on its second 4K section will get a 4K PTE to map it. Unless high-granularity mappings are created via UFFDIO_CONTINUE, it is impossible for hugetlb_fault to create high-granularity mappings. This commit does not handle hugetlb_wp right now, and it doesn't handle HugeTLB page migration and swap entries. Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx> --- mm/hugetlb.c | 90 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 26 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 16b0d192445c..2ee2c48ee79c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -118,6 +118,18 @@ enum hugetlb_level hpage_size_to_level(unsigned long sz) return HUGETLB_LEVEL_PGD; } +/* + * Find the subpage that corresponds to `addr` in `hpage`. + */ +static struct page *hugetlb_find_subpage(struct hstate *h, struct page *hpage, + unsigned long addr) +{ + size_t idx = (addr & ~huge_page_mask(h))/PAGE_SIZE; + + BUG_ON(idx >= pages_per_huge_page(h)); + return &hpage[idx]; +} + static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) @@ -5810,13 +5822,13 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, * false if pte changed or is changing. 
*/ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, - pte_t *ptep, pte_t old_pte) + struct hugetlb_pte *hpte, pte_t old_pte) { spinlock_t *ptl; bool same; - ptl = huge_pte_lock(h, mm, ptep); - same = pte_same(huge_ptep_get(ptep), old_pte); + ptl = hugetlb_pte_lock(mm, hpte); + same = pte_same(huge_ptep_get(hpte->ptep), old_pte); spin_unlock(ptl); return same; @@ -5825,17 +5837,18 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, + unsigned long address, struct hugetlb_pte *hpte, pte_t old_pte, unsigned int flags) { struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; - struct page *page; + struct page *page, *subpage; pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); + unsigned long haddr_hgm = address & hugetlb_pte_mask(hpte); bool new_page, new_pagecache_page = false; u32 hash = hugetlb_fault_mutex_hash(mapping, idx); @@ -5880,7 +5893,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. */ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, hpte, old_pte)) { ret = 0; goto out; } @@ -5904,7 +5917,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * here. Before returning error, get ptl and make * sure there really is no pte entry. 
*/ - if (hugetlb_pte_stable(h, mm, ptep, old_pte)) + if (hugetlb_pte_stable(h, mm, hpte, old_pte)) ret = vmf_error(PTR_ERR(page)); else ret = 0; @@ -5954,7 +5967,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, unlock_page(page); put_page(page); /* See comment in userfaultfd_missing() block above */ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, hpte, old_pte)) { ret = 0; goto out; } @@ -5979,10 +5992,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, vma_end_reservation(h, vma, haddr); } - ptl = huge_pte_lock(h, mm, ptep); + ptl = hugetlb_pte_lock(mm, hpte); ret = 0; /* If pte changed from under us, retry */ - if (!pte_same(huge_ptep_get(ptep), old_pte)) + if (!pte_same(huge_ptep_get(hpte->ptep), old_pte)) goto backout; if (anon_rmap) { @@ -5990,20 +6003,25 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugepage_add_new_anon_rmap(page, vma, haddr); } else page_dup_file_rmap(page, true); - new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) - && (vma->vm_flags & VM_SHARED))); + + subpage = hugetlb_find_subpage(h, page, haddr_hgm); + new_pte = make_huge_pte_with_shift(vma, subpage, + ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED)), + hpte->shift); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. 
*/ if (unlikely(pte_marker_uffd_wp(old_pte))) new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte)); - set_huge_pte_at(mm, haddr, ptep, new_pte); + set_huge_pte_at(mm, haddr_hgm, hpte->ptep, new_pte); - hugetlb_count_add(pages_per_huge_page(h), mm); + hugetlb_count_add(hugetlb_pte_size(hpte) / PAGE_SIZE, mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + BUG_ON(hugetlb_pte_size(hpte) != huge_page_size(h)); /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); + ret = hugetlb_wp(mm, vma, address, hpte->ptep, flags, page, ptl); } spin_unlock(ptl); @@ -6066,11 +6084,14 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, u32 hash; pgoff_t idx; struct page *page = NULL; + struct page *subpage = NULL; struct page *pagecache_page = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; unsigned long haddr = address & huge_page_mask(h); + unsigned long haddr_hgm; + struct hugetlb_pte hpte; ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { @@ -6115,15 +6136,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - entry = huge_ptep_get(ptep); + hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h), + hpage_size_to_level(huge_page_size(h))); + /* Do a high-granularity page table walk. */ + hugetlb_hgm_walk(mm, vma, &hpte, address, PAGE_SIZE, + /*stop_at_none=*/true); + + entry = huge_ptep_get(hpte.ptep); /* PTE markers should be handled the same way as none pte */ - if (huge_pte_none_mostly(entry)) + if (huge_pte_none_mostly(entry)) { /* * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. 
*/ - return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, + return hugetlb_no_page(mm, vma, mapping, idx, address, &hpte, entry, flags); + } ret = 0; @@ -6137,6 +6165,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_present(entry)) goto out_mutex; + if (!hugetlb_pte_present_leaf(&hpte, entry)) + /* We raced with someone splitting the entry. */ + goto out_mutex; + /* * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any @@ -6156,14 +6188,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pagecache_page = find_lock_page(mapping, idx); } - ptl = huge_pte_lock(h, mm, ptep); + ptl = hugetlb_pte_lock(mm, &hpte); /* Check for a racing update before calling hugetlb_wp() */ - if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + if (unlikely(!pte_same(entry, huge_ptep_get(hpte.ptep)))) goto out_ptl; + /* haddr_hgm is the base address of the region that hpte maps. */ + haddr_hgm = address & hugetlb_pte_mask(&hpte); + /* Handle userfault-wp first, before trying to lock more pages */ - if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(entry) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { struct vm_fault vmf = { .vma = vma, @@ -6187,7 +6222,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * pagecache_page, so here we need take the former one * when page != pagecache_page or !pagecache_page. 
*/ - page = pte_page(entry); + subpage = pte_page(entry); + page = compound_head(subpage); if (page != pagecache_page) if (!trylock_page(page)) { need_wait_lock = 1; @@ -6198,7 +6234,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { - ret = hugetlb_wp(mm, vma, address, ptep, flags, + BUG_ON(hugetlb_pte_size(&hpte) != huge_page_size(h)); + ret = hugetlb_wp(mm, vma, address, hpte.ptep, flags, pagecache_page, ptl); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { @@ -6206,9 +6243,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, + if (huge_ptep_set_access_flags(vma, haddr_hgm, hpte.ptep, entry, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, haddr, ptep); + update_mmu_cache(vma, haddr_hgm, hpte.ptep); out_put_page: if (page != pagecache_page) unlock_page(page); @@ -7598,7 +7635,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte = (pte_t *)pmd_alloc(mm, pud, addr); } } - BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); + BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte) && + !hugetlb_hgm_enabled(vma)); return pte; } -- 2.38.0.135.g90850a2211-goog