Re: [PATCH 1/6] xfs: fix inode size update overflow in xfs_map_direct()

Brian Foster <bfoster@xxxxxxxxxx> · Thu, 29 Oct 2015 10:27:32 -0400



On Mon, Oct 19, 2015 at 02:27:13PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> Both direct IO and DAX pass an offset and count into get_blocks that
> will overflow a s64 variable when an IO goes into the last supported
> block in a file (i.e. at offset 2^63 - 1FSB bytes). This can be seen
> from the tracing:
> 
> xfs_get_blocks_alloc: [...] offset 0x7ffffffffffff000 count 4096
> xfs_gbmap_direct:     [...] offset 0x7ffffffffffff000 count 4096
> xfs_gbmap_direct_none:[...] offset 0x7ffffffffffff000 count 4096
> 
> 0x7ffffffffffff000 + 4096 = 0x8000000000000000, and hence that
> overflows the s64 offset and we fail to detect the need for a
> filesize update and an ioend is not allocated.
> 
> This is *mostly* avoided for direct IO because such extending IOs
> occur with full block allocation, and so the "IS_UNWRITTEN()" check
> still evaluates as true and we get an ioend that way. However, doing
> single sector extending IOs to this last block will expose the fact
> that file size updates will not occur after the first allocating
> direct IO as the overflow will then be exposed.
> 
> There is one further complexity: the DAX page fault path also
> exposes the same issue in block allocation. However, page faults
> cannot extend the file size, so in this case we want to allocate the
> block but do not want to allocate an ioend to enable file size
> update at IO completion. Hence we now need to distinguish between
> the direct IO patch allocation and dax fault path allocation to
> avoid leaking ioend structures.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---

Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx>

>  fs/xfs/xfs_aops.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------
>  fs/xfs/xfs_aops.h |  2 ++
>  fs/xfs/xfs_file.c |  6 +++---
>  3 files changed, 49 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index e4fff58..366e41eb 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -1259,13 +1259,28 @@ xfs_vm_releasepage(
>   * the DIO. There is only going to be one reference to the ioend and its life
>   * cycle is constrained by the DIO completion code. hence we don't need
>   * reference counting here.
> + *
> + * Note that for DIO, an IO to the highest supported file block offset (i.e.
> + * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
> + * bit variable. Hence if we see this overflow, we have to assume that the IO is
> + * extending the file size. We won't know for sure until IO completion is run
> + * and the actual max write offset is communicated to the IO completion
> + * routine.
> + *
> + * For DAX page faults, we are preparing to never see unwritten extents here,
> + * nor should we ever extend the inode size. Hence we will soon have nothing to
> + * do here for this case, ensuring we don't have to provide an IO completion
> + * callback to free an ioend that we don't actually need for a fault into the
> + * page at offset (2^63 - 1FSB) bytes.
>   */
> +
>  static void
>  xfs_map_direct(
>  	struct inode		*inode,
>  	struct buffer_head	*bh_result,
>  	struct xfs_bmbt_irec	*imap,
> -	xfs_off_t		offset)
> +	xfs_off_t		offset,
> +	bool			dax_fault)
>  {
>  	struct xfs_ioend	*ioend;
>  	xfs_off_t		size = bh_result->b_size;
> @@ -1278,6 +1293,16 @@ xfs_map_direct(
>  
>  	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
>  
> +	/* XXX: preparation for removing unwritten extents in DAX */
> +#if 0
> +	if (dax_fault) {
> +		ASSERT(type == XFS_IO_OVERWRITE);
> +		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
> +					    imap);
> +		return;
> +	}
> +#endif
> +
>  	if (bh_result->b_private) {
>  		ioend = bh_result->b_private;
>  		ASSERT(ioend->io_size > 0);
> @@ -1292,7 +1317,8 @@ xfs_map_direct(
>  					      ioend->io_size, ioend->io_type,
>  					      imap);
>  	} else if (type == XFS_IO_UNWRITTEN ||
> -		   offset + size > i_size_read(inode)) {
> +		   offset + size > i_size_read(inode) ||
> +		   offset + size < 0) {
>  		ioend = xfs_alloc_ioend(inode, type);
>  		ioend->io_offset = offset;
>  		ioend->io_size = size;
> @@ -1354,7 +1380,8 @@ __xfs_get_blocks(
>  	sector_t		iblock,
>  	struct buffer_head	*bh_result,
>  	int			create,
> -	bool			direct)
> +	bool			direct,
> +	bool			dax_fault)
>  {
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	struct xfs_mount	*mp = ip->i_mount;
> @@ -1467,7 +1494,8 @@ __xfs_get_blocks(
>  			set_buffer_unwritten(bh_result);
>  		/* direct IO needs special help */
>  		if (create && direct)
> -			xfs_map_direct(inode, bh_result, &imap, offset);
> +			xfs_map_direct(inode, bh_result, &imap, offset,
> +				       dax_fault);
>  	}
>  
>  	/*
> @@ -1514,7 +1542,7 @@ xfs_get_blocks(
>  	struct buffer_head	*bh_result,
>  	int			create)
>  {
> -	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
> +	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
>  }
>  
>  int
> @@ -1524,7 +1552,17 @@ xfs_get_blocks_direct(
>  	struct buffer_head	*bh_result,
>  	int			create)
>  {
> -	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
> +	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
> +}
> +
> +int
> +xfs_get_blocks_dax_fault(
> +	struct inode		*inode,
> +	sector_t		iblock,
> +	struct buffer_head	*bh_result,
> +	int			create)
> +{
> +	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
>  }
>  
>  static void
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index 86afd1a..d39ba25 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -58,6 +58,8 @@ int	xfs_get_blocks(struct inode *inode, sector_t offset,
>  		       struct buffer_head *map_bh, int create);
>  int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
>  			      struct buffer_head *map_bh, int create);
> +int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
> +			         struct buffer_head *map_bh, int create);
>  void	xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
>  
>  extern void xfs_count_page_state(struct page *, int *, int *);
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 2f7b6bd..7f873bc 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1508,7 +1508,7 @@ xfs_filemap_page_mkwrite(
>  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  
>  	if (IS_DAX(inode)) {
> -		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
> +		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault,
>  				    xfs_end_io_dax_write);
>  	} else {
>  		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
> @@ -1543,7 +1543,7 @@ xfs_filemap_fault(
>  		 * changes to xfs_get_blocks_direct() to map unwritten extent
>  		 * ioend for conversion on read-only mappings.
>  		 */
> -		ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
> +		ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
>  	} else
>  		ret = filemap_fault(vma, vmf);
>  	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> @@ -1570,7 +1570,7 @@ xfs_filemap_pmd_fault(
>  	sb_start_pagefault(inode->i_sb);
>  	file_update_time(vma->vm_file);
>  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> -	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
> +	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
>  				    xfs_end_io_dax_write);
>  	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  	sb_end_pagefault(inode->i_sb);
> -- 
> 2.5.0
> 
> _______________________________________________
> xfs mailing list
> xfs@xxxxxxxxxxx
> http://oss.sgi.com/mailman/listinfo/xfs

_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs