Most hugetlb fault handling code checks for faults beyond i_size. While there are early checks in the code paths, the most difficult to handle are those discovered after taking the page table lock. At this point, we have possibly allocated a page and consumed associated reservations and possibly added the page to the page cache. When discovering a fault beyond i_size, be sure to: - Remove the page from page cache, else it will sit there until the file is removed. - Do not restore any reservation for the page consumed. Otherwise there will be an outstanding reservation for an offset beyond the end of file. The 'truncation' code in remove_inode_hugepages must deal with fault code potentially removing a page from the cache after the page was returned by pagevec_lookup and before locking the page. This can be discovered by a change in page_mapping(). Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx> --- fs/hugetlbfs/inode.c | 40 ++++++++++++++++++++++------------------ mm/hugetlb.c | 28 ++++++++++++++++++++-------- 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0cf352555354..341156c2a7d0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -490,13 +490,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * unmapped in caller. Unmap (again) now after taking * the fault mutex. The mutex will prevent faults * until we finish removing the page. - * - * This race can only happen in the hole punch case. - * Getting here in a truncate operation is a bug. */ if (unlikely(page_mapped(page))) { - BUG_ON(truncate_op); - i_mmap_lock_write(mapping); hugetlb_vmdelete_list(&mapping->i_mmap, index * pages_per_huge_page(h), @@ -506,22 +501,31 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, lock_page(page); /* - * We must free the huge page and remove from page - * cache BEFORE removing the region/reserve map - * (hugetlb_unreserve_pages). In rare out of memory - * conditions, removal of the region/reserve map could - * fail. Correspondingly, the subpool and global - * reserve usage count can need to be adjusted. + * After locking page, make sure mapping is the same. + * We could have raced with page fault populate and + * backout code. */ - VM_BUG_ON(HPageRestoreReserve(page)); - hugetlb_delete_from_page_cache(page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages(inode, + if (page_mapping(page) == mapping) { + /* + * We must free the huge page and remove from + * page cache BEFORE removing the region/ + * reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of + * the region/reserve map could fail. + * Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. + */ + VM_BUG_ON(HPageRestoreReserve(page)); + hugetlb_delete_from_page_cache(page); + freed++; + if (!truncate_op) { + if (unlikely( + hugetlb_unreserve_pages(inode, index, index + 1, 1))) - hugetlb_fix_reserve_counts(inode); + hugetlb_fix_reserve_counts( + inode); + } } - unlock_page(page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c6d76f61de98..b8f994961a68 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5361,6 +5361,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); bool new_page, new_pagecache_page = false; + bool beyond_i_size = false; + bool reserve_alloc = false; /* * Currently, we are forced to kill the process in the event the @@ -5417,6 +5419,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); new_page = true; + if (HPageRestoreReserve(page)) + reserve_alloc = true; if (vma->vm_flags & VM_MAYSHARE) { int err = hugetlb_add_to_page_cache(page, mapping, idx); @@ -5475,8 +5479,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, ptl = huge_pte_lock(h, mm, ptep); size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) + if (idx >= size) { + beyond_i_size = true; goto backout; + } ret = 0; if (!huge_pte_none(huge_ptep_get(ptep))) @@ -5514,10 +5520,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, backout: spin_unlock(ptl); backout_unlocked: + if (new_pagecache_page && beyond_i_size) + hugetlb_delete_from_page_cache(page); unlock_page(page); /* restore reserve for newly allocated pages not in page cache */ - if (new_page && !new_pagecache_page) - restore_reserve_on_error(h, vma, haddr, page); + if (!new_pagecache_page) { + if (reserve_alloc) + SetHPageRestoreReserve(page); + if (new_page) + restore_reserve_on_error(h, vma, haddr, page); + } put_page(page); goto out; } @@ -5812,15 +5824,15 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * Recheck the i_size after holding PT lock to make sure not * to leave any page mapped (as page_mapped()) beyond the end * of the i_size (remove_inode_hugepages() is strict about - * enforcing that). If we bail out here, we'll also leave a - * page in the radix tree in the vm_shared case beyond the end - * of the i_size, but remove_inode_hugepages() will take care - * of it as soon as we drop the hugetlb_fault_mutex_table. + * enforcing that). If we bail out here, remove the page + * added to the radix tree. */ size = i_size_read(mapping->host) >> huge_page_shift(h); ret = -EFAULT; - if (idx >= size) + if (idx >= size) { + hugetlb_delete_from_page_cache(page); goto out_release_unlock; + } ret = -EEXIST; if (!huge_pte_none(huge_ptep_get(dst_pte))) -- 2.35.1