On 01/07/2016 12:06 AM, Hillf Danton wrote:
>>
>> Page faults can race with fallocate hole punch. If a page fault happens
>> between the unmap and remove operations, the page is not removed and
>> remains within the hole. This is not the desired behavior. The race
>> is difficult to detect in user level code as even in the non-race
>> case, a page within the hole could be faulted back in before fallocate
>> returns. If userfaultfd is expanded to support hugetlbfs in the future,
>> this race will be easier to observe.
>>
>> If this race is detected and a page is mapped, the remove operation
>> (remove_inode_hugepages) will unmap the page before removing. The unmap
>> within remove_inode_hugepages occurs with the hugetlb_fault_mutex held
>> so that no other faults will be processed until the page is removed.
>>
>> The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
>> remove_inode_hugepages to satisfy the new reference.
>>
>> Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
>> ---
>>  fs/hugetlbfs/inode.c | 139 +++++++++++++++++++++++++++------------------------
>>  1 file changed, 73 insertions(+), 66 deletions(-)
>>
>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>> index 0444760..0871d70 100644
>> --- a/fs/hugetlbfs/inode.c
>> +++ b/fs/hugetlbfs/inode.c
>> @@ -324,11 +324,46 @@ static void remove_huge_page(struct page *page)
>>  	delete_from_page_cache(page);
>>  }
>>
>> +static inline void
>> +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
>> +{
>> +	struct vm_area_struct *vma;
>> +
>> +	/*
>> +	 * end == 0 indicates that the entire range after
>> +	 * start should be unmapped.
>> +	 */
>> +	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
>
> [1] perhaps end can be reused.
>
>> +		unsigned long v_offset;
>> +
>> +		/*
>> +		 * Can the expression below overflow on 32-bit arches?
>> +		 * No, because the interval tree returns us only those vmas
>> +		 * which overlap the truncated area starting at pgoff,
>> +		 * and no vma on a 32-bit arch can span beyond the 4GB.
>> +		 */
>> +		if (vma->vm_pgoff < start)
>> +			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
>> +		else
>> +			v_offset = 0;
>> +
>> +		if (end) {
>> +			end = ((end - start) << PAGE_SHIFT) +
>> +				vma->vm_start + v_offset;
>
> [2] end is passed in as a pgoff_t, but here it is changed to the type of
> v_offset. Further, we cannot handle the case where end is passed in as
> zero. See the diff below please.

Thanks Hillf. This bug is part of the existing code. I did not modify the
current hugetlb_vmdelete_list code, just moved it as part of this patch.
Therefore, I will create a separate patch to fix this issue.

--
Mike Kravetz

>
>> +			if (end > vma->vm_end)
>> +				end = vma->vm_end;
>> +		} else
>> +			end = vma->vm_end;
>> +
>> +		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
>> +	}
>> +}
>> +
>>
>>  /*
>>   * remove_inode_hugepages handles two distinct cases: truncation and hole
>>   * punch. There are subtle differences in operation for each case.
>> -
>> + *
>>   * truncation is indicated by end of range being LLONG_MAX
>>   *	In this case, we first scan the range and release found pages.
>>   *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
>> @@ -379,6 +414,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>>
>>  		for (i = 0; i < pagevec_count(&pvec); ++i) {
>>  			struct page *page = pvec.pages[i];
>> +			bool rsv_on_error;
>>  			u32 hash;
>>
>>  			/*
>> @@ -395,37 +431,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>>  							mapping, next, 0);
>>  			mutex_lock(&hugetlb_fault_mutex_table[hash]);
>>
>> -			lock_page(page);
>> -			if (likely(!page_mapped(page))) {
>> -				bool rsv_on_error = !PagePrivate(page);
>> -				/*
>> -				 * We must free the huge page and remove
>> -				 * from page cache (remove_huge_page) BEFORE
>> -				 * removing the region/reserve map
>> -				 * (hugetlb_unreserve_pages). In rare out
>> -				 * of memory conditions, removal of the
>> -				 * region/reserve map could fail. Before
>> -				 * free'ing the page, note PagePrivate which
>> -				 * is used in case of error.
>> -				 */
>> -				remove_huge_page(page);
>> -				freed++;
>> -				if (!truncate_op) {
>> -					if (unlikely(hugetlb_unreserve_pages(
>> -							inode, next,
>> -							next + 1, 1)))
>> -						hugetlb_fix_reserve_counts(
>> -							inode, rsv_on_error);
>> -				}
>> -			} else {
>> -				/*
>> -				 * If page is mapped, it was faulted in after
>> -				 * being unmapped. It indicates a race between
>> -				 * hole punch and page fault. Do nothing in
>> -				 * this case. Getting here in a truncate
>> -				 * operation is a bug.
>> -				 */
>> +			/*
>> +			 * If page is mapped, it was faulted in after being
>> +			 * unmapped in caller. Unmap (again) now after taking
>> +			 * the fault mutex. The mutex will prevent faults
>> +			 * until we finish removing the page.
>> +			 *
>> +			 * This race can only happen in the hole punch case.
>> +			 * Getting here in a truncate operation is a bug.
>> +			 */
>> +			if (unlikely(page_mapped(page))) {
>>  				BUG_ON(truncate_op);
>> +
>> +				i_mmap_lock_write(mapping);
>> +				hugetlb_vmdelete_list(&mapping->i_mmap,
>> +					next * pages_per_huge_page(h),
>> +					(next + 1) * pages_per_huge_page(h));
>> +				i_mmap_unlock_write(mapping);
>> +			}
>> +
>> +			lock_page(page);
>> +			/*
>> +			 * We must free the huge page and remove from page
>> +			 * cache (remove_huge_page) BEFORE removing the
>> +			 * region/reserve map (hugetlb_unreserve_pages). In
>> +			 * rare out of memory conditions, removal of the
>> +			 * region/reserve map could fail. Before free'ing
>> +			 * the page, note PagePrivate which is used in case
>> +			 * of error.
>> +			 */
>> +			rsv_on_error = !PagePrivate(page);
>> +			remove_huge_page(page);
>> +			freed++;
>> +			if (!truncate_op) {
>> +				if (unlikely(hugetlb_unreserve_pages(inode,
>> +							next, next + 1, 1)))
>> +					hugetlb_fix_reserve_counts(inode,
>> +							rsv_on_error);
>>  			}
>>
>>  			unlock_page(page);
>> @@ -452,41 +494,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
>>  	clear_inode(inode);
>>  }
>>
>> -static inline void
>> -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
>> -{
>> -	struct vm_area_struct *vma;
>> -
>> -	/*
>> -	 * end == 0 indicates that the entire range after
>> -	 * start should be unmapped.
>> -	 */
>> -	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
>> -		unsigned long v_offset;
>> -
>> -		/*
>> -		 * Can the expression below overflow on 32-bit arches?
>> -		 * No, because the interval tree returns us only those vmas
>> -		 * which overlap the truncated area starting at pgoff,
>> -		 * and no vma on a 32-bit arch can span beyond the 4GB.
>> -		 */
>> -		if (vma->vm_pgoff < start)
>> -			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
>> -		else
>> -			v_offset = 0;
>> -
>> -		if (end) {
>> -			end = ((end - start) << PAGE_SHIFT) +
>> -				vma->vm_start + v_offset;
>> -			if (end > vma->vm_end)
>> -				end = vma->vm_end;
>> -		} else
>> -			end = vma->vm_end;
>> -
>> -		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
>> -	}
>> -}
>> -
>>  static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
>>  {
>>  	pgoff_t pgoff;
>> --
>> 2.4.3
>>
>
> --- a/fs/hugetlbfs/inode.c	Thu Jan 7 15:04:35 2016
> +++ b/fs/hugetlbfs/inode.c	Thu Jan 7 15:31:03 2016
> @@ -461,8 +461,11 @@ hugetlb_vmdelete_list(struct rb_root *ro
>  	 * end == 0 indicates that the entire range after
>  	 * start should be unmapped.
>  	 */
> -	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> +	if (!end)
> +		end = ULONG_MAX;
> +	vma_interval_tree_foreach(vma, root, start, end) {
>  		unsigned long v_offset;
> +		unsigned long v_end;
>
>  		/*
>  		 * Can the expression below overflow on 32-bit arches?
> @@ -475,15 +478,12 @@ hugetlb_vmdelete_list(struct rb_root *ro
>  		else
>  			v_offset = 0;
>
> -		if (end) {
> -			end = ((end - start) << PAGE_SHIFT) +
> +		v_end = ((end - start) << PAGE_SHIFT) +
>  				vma->vm_start + v_offset;
> -			if (end > vma->vm_end)
> -				end = vma->vm_end;
> -		} else
> -			end = vma->vm_end;
> +		if (v_end > vma->vm_end)
> +			v_end = vma->vm_end;
>
> -		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
> +		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, NULL);
>  	}
>  }
>
> --
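
For anyone who wants to poke at this from user space, below is a minimal,
untested sketch (illustration only, not part of the patch above or of Hillf's
diff) of the scenario the commit log describes: one thread repeatedly punches
a hole in a hugetlbfs file while a second thread keeps faulting the same huge
page back in. The /dev/hugepages path, file name, 2MB huge page size and
iteration count are assumptions; as the log notes, user space cannot reliably
tell whether a page was left behind in the hole, so this only exercises the
race rather than detects it. Build with something like gcc -O2 -pthread.

/*
 * Hypothetical reproducer sketch: punch a hole in a hugetlbfs file while
 * another thread keeps faulting the page back in.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL << 20)	/* assumed 2MB huge pages */
#define ITERATIONS	100000		/* arbitrary */

static volatile char *map;
static volatile int stop;

/* Touch the first huge page over and over to race with the punch. */
static void *faulter(void *unused)
{
	(void)unused;
	while (!stop)
		map[0] = 1;
	return NULL;
}

int main(void)
{
	/* Assumed hugetlbfs mount point and scratch file name. */
	int fd = open("/dev/hugepages/punch-race", O_CREAT | O_RDWR, 0600);
	pthread_t thr;
	long i;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	map = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	pthread_create(&thr, NULL, faulter, NULL);

	/*
	 * Each punch unmaps and then removes the page; a fault from the
	 * other thread can slip in between those two steps inside
	 * fallocate(), which is the window the patch closes.
	 */
	for (i = 0; i < ITERATIONS; i++) {
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      0, HPAGE_SIZE)) {
			perror("fallocate");
			break;
		}
	}

	stop = 1;
	pthread_join(thr, NULL);
	unlink("/dev/hugepages/punch-race");
	return 0;
}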