Re: [PATCH 10/21] xfs: remove xfs_map_cow

Brian Foster <bfoster@xxxxxxxxxx> · Mon, 4 Jun 2018 08:25:33 -0400

On Thu, May 31, 2018 at 08:07:48PM +0200, Christoph Hellwig wrote:
> We can handle the existing cow mapping case as a special case directly
> in xfs_writepage_map, and share code for allocating delalloc blocks
> with regular I/O in xfs_map_blocks.  This means we need to always
> call xfs_map_blocks for reflink inodes, but we can still skip most of
> the work if it turns out that there is no COW mapping overlapping the
> current block.
> 
> As a subtle detail we need to start caching holes in the wpc to deal
> with the case of COW reservations between EOF.  But we'll need that
> infrastructure later anyway, so this is no big deal.
> 
> Based on a patch from Dave Chinner.
> 
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> ---
>  fs/xfs/xfs_aops.c | 184 ++++++++++++++++++++++------------------------
>  fs/xfs/xfs_aops.h |   4 +-
>  2 files changed, 90 insertions(+), 98 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 9b5a4b6e268c..d536509ca05b 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -375,70 +375,107 @@ xfs_end_bio(
>  
>  STATIC int
>  xfs_map_blocks(
> +	struct xfs_writepage_ctx *wpc,
>  	struct inode		*inode,
> -	loff_t			offset,
> -	struct xfs_bmbt_irec	*imap,
> -	int			type)
> +	loff_t			offset)
>  {
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	struct xfs_mount	*mp = ip->i_mount;
>  	ssize_t			count = i_blocksize(inode);
> -	xfs_fileoff_t		offset_fsb, end_fsb;
> +	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
> +	struct xfs_bmbt_irec	imap;
> +	int			whichfork = XFS_DATA_FORK;
>  	int			error = 0;
>  	int			nimaps = 1;
>  
>  	if (XFS_FORCED_SHUTDOWN(mp))
>  		return -EIO;
>  
> -	/*
> -	 * Truncate can race with writeback since writeback doesn't take the
> -	 * iolock and truncate decreases the file size before it starts
> -	 * truncating the pages between new_size and old_size.  Therefore, we
> -	 * can end up in the situation where writeback gets a CoW fork mapping
> -	 * but the truncate makes the mapping invalid and we end up in here
> -	 * trying to get a new mapping.  Bail out here so that we simply never
> -	 * get a valid mapping and so we drop the write altogether.  The page
> -	 * truncation will kill the contents anyway.
> -	 */
> -	if (type == XFS_IO_COW && offset > i_size_read(inode))
> -		return 0;
> -
> -	ASSERT(type != XFS_IO_COW);
> -
>  	xfs_ilock(ip, XFS_ILOCK_SHARED);
>  	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
>  	       (ip->i_df.if_flags & XFS_IFEXTENTS));
>  	ASSERT(offset <= mp->m_super->s_maxbytes);
>  
> +	if (xfs_is_reflink_inode(ip) &&
> +	    xfs_reflink_find_cow_mapping(ip, offset, &imap)) {
> +		xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +		/*
> +		 * Truncate can race with writeback since writeback doesn't
> +		 * take the iolock and truncate decreases the file size before
> +		 * it starts truncating the pages between new_size and old_size.
> +		 * Therefore, we can end up in the situation where writeback
> +		 * gets a CoW fork mapping but the truncate makes the mapping
> +		 * invalid and we end up in here trying to get a new mapping.
> +		 * bail out here so that we simply never get a valid mapping
> +		 * and so we drop the write altogether.  The page truncation
> +		 * will kill the contents anyway.
> +		 */
> +		if (offset > i_size_read(inode)) {
> +			whichfork = XFS_IO_HOLE;

Not sure what the point of this assignment is if we're going to return,
but that's not a valid fork either way. Did you mean to assign
wpc->io_type?

> +			return 0;
> +		}
> +		whichfork = XFS_COW_FORK;
> +		wpc->io_type = XFS_IO_COW;
> +		goto allocate_blocks;
> +	}
> +
> +	/*
> +	 * Map valid and no COW extent in the way?  We're done.
> +	 */
> +	if (wpc->imap_valid) {
> +		xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +		return 0;
> +	}
> +
> +	/*
> +	 * If we don't have a valid map, now it's time to get a new one for this
> +	 * offset.  This will convert delayed allocations (including COW ones)
> +	 * into real extents.
> +	 */
>  	if (offset > mp->m_super->s_maxbytes - count)
>  		count = mp->m_super->s_maxbytes - offset;
>  	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
>  	offset_fsb = XFS_B_TO_FSBT(mp, offset);
>  	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
> -				imap, &nimaps, XFS_BMAPI_ENTIRE);
> +				&imap, &nimaps, XFS_BMAPI_ENTIRE);
>  	xfs_iunlock(ip, XFS_ILOCK_SHARED);
> -
>  	if (error)
>  		return error;
>  
> -	if (type == XFS_IO_DELALLOC &&
> -	    (!nimaps || isnullstartblock(imap->br_startblock))) {
> -		error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset,
> -				imap);
> -		if (!error)
> -			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
> -		return error;
> +	if (!nimaps) {
> +		/*
> +		 * Lookup returns no match? Beyond eof? regardless,
> +		 * return it as a hole so we don't write it
> +		 */
> +		imap.br_startoff = offset_fsb;
> +		imap.br_blockcount = end_fsb - offset_fsb;
> +		imap.br_startblock = HOLESTARTBLOCK;
> +		wpc->io_type = XFS_IO_HOLE;
> +	} else if (imap.br_startblock == HOLESTARTBLOCK) {
> +		/* landed in a hole */
> +		wpc->io_type = XFS_IO_HOLE;
>  	}
>  
> +	if (wpc->io_type == XFS_IO_DELALLOC &&
> +	    (!nimaps || isnullstartblock(imap.br_startblock)))
> +		goto allocate_blocks;
> +
>  #ifdef DEBUG
> -	if (type == XFS_IO_UNWRITTEN) {
> +	if (wpc->io_type == XFS_IO_UNWRITTEN) {
>  		ASSERT(nimaps);
> -		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
> -		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
> +		ASSERT(imap.br_startblock != HOLESTARTBLOCK);
> +		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
>  	}
>  #endif
> -	if (nimaps)
> -		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
> +	wpc->imap = imap;
> +	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
> +	return 0;
> +allocate_blocks:
> +	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap);
> +	if (error)
> +		return error;
> +	wpc->imap = imap;
> +	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
>  	return 0;
>  }
>  
> @@ -759,56 +796,6 @@ xfs_aops_discard_page(
>  	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
>  }
>  
> -static int
> -xfs_map_cow(
> -	struct xfs_writepage_ctx *wpc,
> -	struct inode		*inode,
> -	loff_t			offset,
> -	unsigned int		*new_type)
> -{
> -	struct xfs_inode	*ip = XFS_I(inode);
> -	struct xfs_bmbt_irec	imap;
> -	bool			is_cow = false;
> -	int			error;
> -
> -	/*
> -	 * If we already have a valid COW mapping keep using it.
> -	 */
> -	if (wpc->io_type == XFS_IO_COW) {
> -		wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
> -		if (wpc->imap_valid) {
> -			*new_type = XFS_IO_COW;
> -			return 0;
> -		}
> -	}
> -
> -	/*
> -	 * Else we need to check if there is a COW mapping at this offset.
> -	 */
> -	xfs_ilock(ip, XFS_ILOCK_SHARED);
> -	is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
> -	xfs_iunlock(ip, XFS_ILOCK_SHARED);
> -
> -	if (!is_cow)
> -		return 0;
> -
> -	/*
> -	 * And if the COW mapping has a delayed extent here we need to
> -	 * allocate real space for it now.
> -	 */
> -	if (isnullstartblock(imap.br_startblock)) {
> -		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
> -				&imap);
> -		if (error)
> -			return error;
> -	}
> -
> -	wpc->io_type = *new_type = XFS_IO_COW;
> -	wpc->imap_valid = true;
> -	wpc->imap = imap;
> -	return 0;
> -}
> -
>  /*
>   * We implement an immediate ioend submission policy here to avoid needing to
>   * chain multiple ioends and hence nest mempool allocations which can violate
> @@ -873,10 +860,13 @@ xfs_writepage_map(
>  			continue;
>  		}
>  
> -		if (xfs_is_reflink_inode(XFS_I(inode))) {
> -			error = xfs_map_cow(wpc, inode, offset, &new_type);
> -			if (error)
> -				goto out;
> +		/*
> +		 * If we already have a valid COW mapping keep using it.
> +		 */
> +		if (wpc->io_type == XFS_IO_COW &&
> +		    xfs_imap_valid(inode, &wpc->imap, offset)) {
> +			wpc->imap_valid = true;
> +			new_type = XFS_IO_COW;
>  		}

So this lifts the cow I/O type check from xfs_map_cow(). That path would
set imap_valid and new_type and return, which essentially looks like a
"blocks found" case where we go ahead and add the buffer to the ioend.
Now we set the same values, but we have to call xfs_map_blocks() to make
sure we cover the cow blocks overlap case that xfs_map_cow() used to
handle.

Shouldn't passing this check effectively jump straight to to the buffer
add? Otherwise it looks like we unconditionally attempt to allocate
blocks in the cow fork.

Alternatively...

>  
>  		if (wpc->io_type != new_type) {
> @@ -887,22 +877,22 @@ xfs_writepage_map(
>  		if (wpc->imap_valid)
>  			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
>  							 offset);
> -		if (!wpc->imap_valid) {
> -			error = xfs_map_blocks(inode, offset, &wpc->imap,
> -					     wpc->io_type);
> +		if (!wpc->imap_valid || xfs_is_reflink_inode(XFS_I(inode))) {
> +			error = xfs_map_blocks(wpc, inode, offset);

... perhaps this could change to something like:

		if (!wpc->imap_valid ||
		    (xfs_is_reflink_inode(XFS_I(inode)) &&
		     wpc->io_type != XFS_IO_COW))
			error = ...

... so the logic is a bit more clear. E.g., we always need to look up in
the COW fork for reflink inodes unless we're already doing COW I/O. Hm?

Brian

>  			if (error)
>  				goto out;
>  			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
>  							 offset);
>  		}
> -		if (wpc->imap_valid) {
> -			lock_buffer(bh);
> -			if (wpc->io_type != XFS_IO_OVERWRITE)
> -				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
> -			xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
> -			count++;
> -		}
>  
> +		if (!wpc->imap_valid || wpc->io_type == XFS_IO_HOLE)
> +			continue;
> +
> +		lock_buffer(bh);
> +		if (wpc->io_type != XFS_IO_OVERWRITE)
> +			xfs_map_at_offset(inode, bh, &wpc->imap, offset);
> +		xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
> +		count++;
>  	} while (offset += len, ((bh = bh->b_this_page) != head));
>  
>  	ASSERT(wpc->ioend || list_empty(&submit_list));
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index 69346d460dfa..b2ef5b661761 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -29,6 +29,7 @@ enum {
>  	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
>  	XFS_IO_OVERWRITE,	/* covers already allocated extent */
>  	XFS_IO_COW,		/* covers copy-on-write extent */
> +	XFS_IO_HOLE,		/* covers region without any block allocation */
>  };
>  
>  #define XFS_IO_TYPES \
> @@ -36,7 +37,8 @@ enum {
>  	{ XFS_IO_DELALLOC,		"delalloc" }, \
>  	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
>  	{ XFS_IO_OVERWRITE,		"overwrite" }, \
> -	{ XFS_IO_COW,			"CoW" }
> +	{ XFS_IO_COW,			"CoW" }, \
> +	{ XFS_IO_HOLE,			"hole" }
>  
>  /*
>   * Structure for buffered I/O completions.
> -- 
> 2.17.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html