The userfaultfd hugetlb tests detect a resv_huge_pages underflow. This happens when hugetlb_mcopy_atomic_pte() is called with !is_continue on an index for which we already have a page in the cache. When this happens, we allocate a second page, double consuming the reservation, and then fail to insert the page into the cache and return -EEXIST. To fix this, we first if there exists a page in the cache which already consumed the reservation, and return -EEXIST immediately if so. Secondly, if we fail to copy the page contents while holding the hugetlb_fault_mutex, we will drop the mutex and return to the caller after allocating a page that consumed a reservation. In this case there may be a fault that double consumes the reservation. To handle this, we free the allocated page, fix the reservations, and allocate a temporary hugetlb page and return that to the caller. When the caller does the copy outside of the lock, we again check the cache, and allocate a page consuming the reservation, and copy over the contents. Test: Hacked the code locally such that resv_huge_pages underflows produce a warning and the copy_huge_page_from_user() always fails, then: ./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10 2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success ./tools/testing/selftests/vm/userfaultfd hugetlb 10 2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success Both tests succeed and produce no warnings. After the test runs number of free/resv hugepages is correct. Signed-off-by: Mina Almasry <almasrymina@xxxxxxxxxx> Cc: Axel Rasmussen <axelrasmussen@xxxxxxxxxx> Cc: Peter Xu <peterx@xxxxxxxxxx> Cc: linux-mm@xxxxxxxxx Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: linux-mm@xxxxxxxxx Cc: linux-kernel@xxxxxxxxxxxxxxx --- include/linux/hugetlb.h | 4 ++ mm/hugetlb.c | 103 ++++++++++++++++++++++++++++++++++++---- mm/migrate.c | 39 +++------------ 3 files changed, 103 insertions(+), 43 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b92f25ccef58..427974510965 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -194,6 +194,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, bool is_hugetlb_entry_migration(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); +void hugetlb_copy_page(struct page *dst, struct page *src); + #else /* !CONFIG_HUGETLB_PAGE */ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) @@ -379,6 +381,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } +static inline void hugetlb_copy_page(struct page *dst, struct page *src); + #endif /* !CONFIG_HUGETLB_PAGE */ /* * hugepages at page global directory. If arch support diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 629aa4c2259c..cb041c97a558 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -81,6 +81,45 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); +/* + * Gigantic pages are so large that we do not guarantee that page++ pointer + * arithmetic will work across the entire page. We need something more + * specialized. + */ +static void __copy_gigantic_page(struct page *dst, struct page *src, + int nr_pages) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < nr_pages;) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void hugetlb_copy_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + int nr_pages = pages_per_huge_page(h); + + if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { + __copy_gigantic_page(dst, src, nr_pages); + return; + } + + for (i = 0; i < nr_pages; i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } +} + static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) @@ -4868,19 +4907,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct page **pagep) { bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); - struct address_space *mapping; - pgoff_t idx; + struct hstate *h = hstate_vma(dst_vma); + struct address_space *mapping = dst_vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); unsigned long size; int vm_shared = dst_vma->vm_flags & VM_SHARED; - struct hstate *h = hstate_vma(dst_vma); pte_t _dst_pte; spinlock_t *ptl; - int ret; + int ret = -ENOMEM; struct page *page; int writable; - - mapping = dst_vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, dst_vma, dst_addr); + struct mempolicy *mpol; + nodemask_t *nodemask; + gfp_t gfp_mask = htlb_alloc_mask(h); + int node = huge_node(dst_vma, dst_addr, gfp_mask, &mpol, &nodemask); if (is_continue) { ret = -EFAULT; @@ -4888,7 +4928,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out; } else if (!*pagep) { - ret = -ENOMEM; + /* If a page already exists, then it's UFFDIO_COPY for + * a non-missing case. Return -EEXIST. + */ + if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + ret = -EEXIST; + goto out; + } + page = alloc_huge_page(dst_vma, dst_addr, 0); if (IS_ERR(page)) goto out; @@ -4900,12 +4947,48 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; + /* Free the allocated page which may have + * consumed a reservation. + */ + restore_reserve_on_error(h, dst_vma, dst_addr, page); + if (!HPageRestoreReserve(page)) { + if (unlikely(hugetlb_unreserve_pages( + mapping->host, idx, idx + 1, 1))) + hugetlb_fix_reserve_counts( + mapping->host); + } + put_page(page); + + /* Allocate a temporary page to hold the copied + * contents. + */ + page = alloc_migrate_huge_page(h, gfp_mask, node, + nodemask); + if (IS_ERR(page)) { + ret = -ENOMEM; + goto out; + } *pagep = page; - /* don't free the page */ + /* Set the outparam pagep and return to the caller to + * copy the contents outside the lock. Don't free the + * page. + */ goto out; } } else { - page = *pagep; + if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + put_page(*pagep); + ret = -EEXIST; + goto out; + } + + page = alloc_huge_page(dst_vma, dst_addr, 0); + if (IS_ERR(page)) { + ret = -ENOMEM; + goto out; + } + __copy_gigantic_page(page, *pagep, pages_per_huge_page(h)); + put_page(*pagep); *pagep = NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 6b37d00890ca..d3437f9a608d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -528,28 +528,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } -/* - * Gigantic pages are so large that we do not guarantee that page++ pointer - * arithmetic will work across the entire page. We need something more - * specialized. - */ -static void __copy_gigantic_page(struct page *dst, struct page *src, - int nr_pages) -{ - int i; - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < nr_pages; ) { - cond_resched(); - copy_highpage(dst, src); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - static void copy_huge_page(struct page *dst, struct page *src) { int i; @@ -557,19 +535,14 @@ static void copy_huge_page(struct page *dst, struct page *src) if (PageHuge(src)) { /* hugetlbfs page */ - struct hstate *h = page_hstate(src); - nr_pages = pages_per_huge_page(h); - - if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { - __copy_gigantic_page(dst, src, nr_pages); - return; - } - } else { - /* thp page */ - BUG_ON(!PageTransHuge(src)); - nr_pages = thp_nr_pages(src); + hugetlb_copy_page(dst, src); + return; } + /* thp page */ + BUG_ON(!PageTransHuge(src)); + nr_pages = thp_nr_pages(src); + for (i = 0; i < nr_pages; i++) { cond_resched(); copy_highpage(dst + i, src + i); -- 2.31.1.818.g46aad6cb9e-goog