[RFC PATCH 4/5] hugetlbfs: catch and handle truncate racing with page faults

Mike Kravetz <mike.kravetz@xxxxxxxxxx> · Wed, 6 Apr 2022 13:48:22 -0700

Most hugetlb fault handling code checks for faults beyond i_size.
While there are early checks in the code paths, the most difficult
to handle are those discovered after taking the page table lock.
At this point, we have possibly allocated a page and consumed
associated reservations and possibly added the page to the page cache.

When discovering a fault beyond i_size, be sure to:
- Remove the page from page cache, else it will sit there until the
  file is removed.
- Do not restore any reservation for the page consumed.  Otherwise
  there will be an outstanding reservation for an offset beyond the
  end of file.

The 'truncation' code in remove_inode_hugepages must deal with fault
code potentially removing a page from the cache after the page was
returned by pagevec_lookup and before locking the page.  This can be
discovered by a change in page_mapping().

Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
---
 fs/hugetlbfs/inode.c | 40 ++++++++++++++++++++++------------------
 mm/hugetlb.c         | 28 ++++++++++++++++++++--------
 2 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0cf352555354..341156c2a7d0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -490,13 +490,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			 * unmapped in caller.  Unmap (again) now after taking
 			 * the fault mutex.  The mutex will prevent faults
 			 * until we finish removing the page.
-			 *
-			 * This race can only happen in the hole punch case.
-			 * Getting here in a truncate operation is a bug.
 			 */
 			if (unlikely(page_mapped(page))) {
-				BUG_ON(truncate_op);
-
 				i_mmap_lock_write(mapping);
 				hugetlb_vmdelete_list(&mapping->i_mmap,
 					index * pages_per_huge_page(h),
@@ -506,22 +501,31 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 			lock_page(page);
 			/*
-			 * We must free the huge page and remove from page
-			 * cache BEFORE removing the region/reserve map
-			 * (hugetlb_unreserve_pages).  In rare out of memory
-			 * conditions, removal of the region/reserve map could
-			 * fail. Correspondingly, the subpool and global
-			 * reserve usage count can need to be adjusted.
+			 * After locking page, make sure mapping is the same.
+			 * We could have raced with page fault populate and
+			 * backout code.
 			 */
-			VM_BUG_ON(HPageRestoreReserve(page));
-			hugetlb_delete_from_page_cache(page);
-			freed++;
-			if (!truncate_op) {
-				if (unlikely(hugetlb_unreserve_pages(inode,
+			if (page_mapping(page) == mapping) {
+				/*
+				 * We must free the huge page and remove from
+				 * page cache BEFORE removing the region/
+				 * reserve map (hugetlb_unreserve_pages).  In
+				 * rare out of memory conditions, removal of
+				 * the region/reserve map could fail.
+				 * Correspondingly, the subpool and global
+				 * reserve usage count can need to be adjusted.
+				 */
+				VM_BUG_ON(HPageRestoreReserve(page));
+				hugetlb_delete_from_page_cache(page);
+				freed++;
+				if (!truncate_op) {
+					if (unlikely(
+					    hugetlb_unreserve_pages(inode,
 							index, index + 1, 1)))
-					hugetlb_fix_reserve_counts(inode);
+						hugetlb_fix_reserve_counts(
+								inode);
+				}
 			}
-
 			unlock_page(page);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c6d76f61de98..b8f994961a68 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5361,6 +5361,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	spinlock_t *ptl;
 	unsigned long haddr = address & huge_page_mask(h);
 	bool new_page, new_pagecache_page = false;
+	bool beyond_i_size = false;
+	bool reserve_alloc = false;
 
 	/*
 	 * Currently, we are forced to kill the process in the event the
@@ -5417,6 +5419,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 		clear_huge_page(page, address, pages_per_huge_page(h));
 		__SetPageUptodate(page);
 		new_page = true;
+		if (HPageRestoreReserve(page))
+			reserve_alloc = true;
 
 		if (vma->vm_flags & VM_MAYSHARE) {
 			int err = hugetlb_add_to_page_cache(page, mapping, idx);
@@ -5475,8 +5479,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 
 	ptl = huge_pte_lock(h, mm, ptep);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
-	if (idx >= size)
+	if (idx >= size) {
+		beyond_i_size = true;
 		goto backout;
+	}
 
 	ret = 0;
 	if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -5514,10 +5520,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 backout:
 	spin_unlock(ptl);
 backout_unlocked:
+	if (new_pagecache_page && beyond_i_size)
+		hugetlb_delete_from_page_cache(page);
 	unlock_page(page);
 	/* restore reserve for newly allocated pages not in page cache */
-	if (new_page && !new_pagecache_page)
-		restore_reserve_on_error(h, vma, haddr, page);
+	if (!new_pagecache_page) {
+		if (reserve_alloc)
+			SetHPageRestoreReserve(page);
+		if (new_page)
+			restore_reserve_on_error(h, vma, haddr, page);
+	}
 	put_page(page);
 	goto out;
 }
@@ -5812,15 +5824,15 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	 * Recheck the i_size after holding PT lock to make sure not
 	 * to leave any page mapped (as page_mapped()) beyond the end
 	 * of the i_size (remove_inode_hugepages() is strict about
-	 * enforcing that). If we bail out here, we'll also leave a
-	 * page in the radix tree in the vm_shared case beyond the end
-	 * of the i_size, but remove_inode_hugepages() will take care
-	 * of it as soon as we drop the hugetlb_fault_mutex_table.
+	 * enforcing that). If we bail out here, remove the page
+	 * added to the radix tree.
 	 */
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	ret = -EFAULT;
-	if (idx >= size)
+	if (idx >= size) {
+		hugetlb_delete_from_page_cache(page);
 		goto out_release_unlock;
+	}
 
 	ret = -EEXIST;
 	if (!huge_pte_none(huge_ptep_get(dst_pte)))
-- 
2.35.1