Re: [PATCH 41/63] xfs: reflink extents from one file to another

Brian Foster <bfoster@xxxxxxxxxx> · Mon, 10 Oct 2016 09:17:09 -0400

On Fri, Oct 07, 2016 at 02:41:50PM -0700, Darrick J. Wong wrote:
> On Fri, Oct 07, 2016 at 04:48:52PM -0400, Brian Foster wrote:
> > On Fri, Oct 07, 2016 at 12:44:30PM -0700, Darrick J. Wong wrote:
> > > On Fri, Oct 07, 2016 at 02:04:15PM -0400, Brian Foster wrote:
> > > > On Thu, Sep 29, 2016 at 08:10:05PM -0700, Darrick J. Wong wrote:
> > > > > Reflink extents from one file to another; that is to say, iteratively
> > > > > remove the mappings from the destination file, copy the mappings from
> > > > > the source file to the destination file, and increment the reference
> > > > > count of all the blocks that got remapped.
> > > > > 
> > > > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> > > > > ---
> > > > > v2: Call xfs_defer_cancel before cancelling the transaction if the
> > > > > remap operation fails.  Use the deferred operations system to avoid
> > > > > deadlocks or blowing out the transaction reservation, and make the
> > > > > entire reflink operation atomic for each extent being remapped.  The
> > > > > destination file's i_size will be updated if necessary to avoid
> > > > > violating the assumption that there are no shared blocks past the EOF
> > > > > block.
> > > > > ---
> > > > >  fs/xfs/xfs_reflink.c |  425 ++++++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  fs/xfs/xfs_reflink.h |    2 
> > > > >  2 files changed, 427 insertions(+)
> > > > > 
> > > > > 
> > > > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > > > > index 673ecc1..94c19fff 100644
> > > > > --- a/fs/xfs/xfs_reflink.c
> > > > > +++ b/fs/xfs/xfs_reflink.c
> > > > > @@ -922,3 +922,428 @@ xfs_reflink_recover_cow(
> > > > >  
> > > > >  	return error;
> > > > >  }
> > > > ...
> > > > > +/*
> > > > > + * Unmap a range of blocks from a file, then map other blocks into the hole.
> > > > > + * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
> > > > > + * The extent irec is mapped into dest at irec->br_startoff.
> > > > > + */
> > > > > +STATIC int
> > > > > +xfs_reflink_remap_extent(
> > > > > +	struct xfs_inode	*ip,
> > > > > +	struct xfs_bmbt_irec	*irec,
> > > > > +	xfs_fileoff_t		destoff,
> > > > > +	xfs_off_t		new_isize)
> > > > > +{
> > > > > +	struct xfs_mount	*mp = ip->i_mount;
> > > > > +	struct xfs_trans	*tp;
> > > > > +	xfs_fsblock_t		firstfsb;
> > > > > +	unsigned int		resblks;
> > > > > +	struct xfs_defer_ops	dfops;
> > > > > +	struct xfs_bmbt_irec	uirec;
> > > > > +	bool			real_extent;
> > > > > +	xfs_filblks_t		rlen;
> > > > > +	xfs_filblks_t		unmap_len;
> > > > > +	xfs_off_t		newlen;
> > > > > +	int			error;
> > > > > +
> > > > > +	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
> > > > > +	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
> > > > > +
> > > > > +	/* Only remap normal extents. */
> > > > > +	real_extent =  (irec->br_startblock != HOLESTARTBLOCK &&
> > > > > +			irec->br_startblock != DELAYSTARTBLOCK &&
> > > > > +			!ISUNWRITTEN(irec));
> > > > > +
> > > > > +	/* Start a rolling transaction to switch the mappings */
> > > > > +	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
> > > > > +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
> > > > > +	if (error)
> > > > > +		goto out;
> > > > > +
> > > > > +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> > > > > +	xfs_trans_ijoin(tp, ip, 0);
> > > > > +
> > > > > +	/* If we're not just clearing space, then do we have enough quota? */
> > > > > +	if (real_extent) {
> > > > > +		error = xfs_trans_reserve_quota_nblks(tp, ip,
> > > > > +				irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
> > > > > +		if (error)
> > > > > +			goto out_cancel;
> > > > > +	}
> > > > > +
> > > > > +	trace_xfs_reflink_remap(ip, irec->br_startoff,
> > > > > +				irec->br_blockcount, irec->br_startblock);
> > > > > +
> > > > > +	/* Unmap the old blocks in the data fork. */
> > > > > +	rlen = unmap_len;
> > > > > +	while (rlen) {
> > > > > +		xfs_defer_init(&dfops, &firstfsb);
> > > > > +		error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1,
> > > > > +				&firstfsb, &dfops);
> > > > > +		if (error)
> > > > > +			goto out_defer;
> > > > > +
> > > > > +		/* Trim the extent to whatever got unmapped. */
> > > > > +		uirec = *irec;
> > > > > +		xfs_trim_extent(&uirec, destoff + rlen, unmap_len - rlen);
> > > > > +		unmap_len = rlen;
> > > > > +
> > > > > +		/* If this isn't a real mapping, we're done. */
> > > > > +		if (!real_extent || uirec.br_blockcount == 0)
> > > > > +			goto next_extent;
> > > > > +
> > > > 
> > > > Any reason we couldn't reuse existing mechanisms for this? E.g., hole
> > > > punch the dest file range before we remap the source file extents. That
> > > > might change behavior in the event of a partial/failed reflink, but it's
> > > > not clear to me that matters.
> > > 
> > > It matters a lot for the dedupe operation -- the unmap and remap
> > > operations must be atomic with each other so that if the dedupe
> > > operation fails, the user will still see the same file contents after
> > > reboot/recovery.  We don't want users to find their files suddenly full
> > > of zeroes.
> > > 
> > 
> > Ok, that makes sense. Though the dedup atomicity is provided simply by
> > doing each unmap/remap within the same transaction, right? I'm kind of
> 
> The unmap/remap are done within the same defer_ops, but different transactions.
> 

Ok.

> > wondering if we could do something like refactor/reuse
> > xfs_unmap_extent(), pull the trans alloc/commit and the unmap call up
> > into xfs_reflink_remap_blocks(), then clean out
> > xfs_reflink_remap_extent() a bit as a result.
> 
> Hm.  Let's start with the current structure:
> 
> for each extent in the source file,
>   alloc transaction
>   for each extent in the dest file that bunmapi tells us is now empty,
>     log refcount increase intent
>     log bmap remap intent
>     update quota
>     update isize if needed
>     _defer_finish
>   commit transaction
> 
> You could flatten _remap_extent and _remap_blocks into a single
> function with a double loop, I suppose.  I don't think trying to reuse
> _unmap_extent buys us much, however -- for the truncate case we simply
> unmapi and _defer_finish, but for reflink we have all those extra steps
> that have to go between the bunmapi and the defer_finish.  Furthermore
> we still have to use __xfs_bunmapi for reflink because we have to know
> exactly which part to remap since we can only unmap one extent per
> transaction.
> 

Hmm, Ok. I was really just aiming for some cleanup/reuse, but the
requirements here might make it not worthwhile.

> > But meh, this stuff is already merged so maybe I should just send a
> > patch. :P
> 
> That said, if you send a patch I'll have a look. :)
> 

I'll play around with it when I have a chance. If nothing else it will
probably help me understand it better. ;) Thanks.

Brian

> --D
> 
> > 
> > Brian
> > 
> > > For reflink I suspect that you're right, but we already guarantee that
> > > the user sees either the old contents or the new contents, so yay. :)
> > > 
> > > > 
> > > > > +		trace_xfs_reflink_remap(ip, uirec.br_startoff,
> > > > > +				uirec.br_blockcount, uirec.br_startblock);
> > > > > +
> > > > ...
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Iteratively remap one file's extents (and holes) to another's.
> > > > > + */
> > > > > +STATIC int
> > > > > +xfs_reflink_remap_blocks(
> > > > > +	struct xfs_inode	*src,
> > > > > +	xfs_fileoff_t		srcoff,
> > > > > +	struct xfs_inode	*dest,
> > > > > +	xfs_fileoff_t		destoff,
> > > > > +	xfs_filblks_t		len,
> > > > > +	xfs_off_t		new_isize)
> > > > > +{
> > > > > +	struct xfs_bmbt_irec	imap;
> > > > > +	int			nimaps;
> > > > > +	int			error = 0;
> > > > > +	xfs_filblks_t		range_len;
> > > > > +
> > > > > +	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
> > > > > +	while (len) {
> > > > > +		trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
> > > > > +				dest, destoff);
> > > > > +		/* Read extent from the source file */
> > > > > +		nimaps = 1;
> > > > > +		xfs_ilock(src, XFS_ILOCK_EXCL);
> > > > > +		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
> > > > > +		xfs_iunlock(src, XFS_ILOCK_EXCL);
> > > > > +		if (error)
> > > > > +			goto err;
> > > > > +		ASSERT(nimaps == 1);
> > > > > +
> > > > > +		trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
> > > > > +				&imap);
> > > > > +
> > > > > +		/* Translate imap into the destination file. */
> > > > > +		range_len = imap.br_startoff + imap.br_blockcount - srcoff;
> > > > > +		imap.br_startoff += destoff - srcoff;
> > > > > +
> > > > 
> > > > Just FYI... these are all unsigned vars...
> > > 
> > > Yeah.  It should handle that correctly.  See generic/30[34].
> > > 
> > > --D
> > > 
> > > > 
> > > > Brian
> > > > 
> > > > > +		/* Clear dest from destoff to the end of imap and map it in. */
> > > > > +		error = xfs_reflink_remap_extent(dest, &imap, destoff,
> > > > > +				new_isize);
> > > > > +		if (error)
> > > > > +			goto err;
> > > > > +
> > > > > +		if (fatal_signal_pending(current)) {
> > > > > +			error = -EINTR;
> > > > > +			goto err;
> > > > > +		}
> > > > > +
> > > > > +		/* Advance drange/srange */
> > > > > +		srcoff += range_len;
> > > > > +		destoff += range_len;
> > > > > +		len -= range_len;
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +
> > > > > +err:
> > > > > +	trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
> > > > > +	return error;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Link a range of blocks from one file to another.
> > > > > + */
> > > > > +int
> > > > > +xfs_reflink_remap_range(
> > > > > +	struct xfs_inode	*src,
> > > > > +	xfs_off_t		srcoff,
> > > > > +	struct xfs_inode	*dest,
> > > > > +	xfs_off_t		destoff,
> > > > > +	xfs_off_t		len)
> > > > > +{
> > > > > +	struct xfs_mount	*mp = src->i_mount;
> > > > > +	xfs_fileoff_t		sfsbno, dfsbno;
> > > > > +	xfs_filblks_t		fsblen;
> > > > > +	int			error;
> > > > > +
> > > > > +	if (!xfs_sb_version_hasreflink(&mp->m_sb))
> > > > > +		return -EOPNOTSUPP;
> > > > > +
> > > > > +	if (XFS_FORCED_SHUTDOWN(mp))
> > > > > +		return -EIO;
> > > > > +
> > > > > +	/* Don't reflink realtime inodes */
> > > > > +	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff);
> > > > > +
> > > > > +	/* Lock both files against IO */
> > > > > +	if (src->i_ino == dest->i_ino) {
> > > > > +		xfs_ilock(src, XFS_IOLOCK_EXCL);
> > > > > +		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
> > > > > +	} else {
> > > > > +		xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
> > > > > +		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
> > > > > +	}
> > > > > +
> > > > > +	error = xfs_reflink_set_inode_flag(src, dest);
> > > > > +	if (error)
> > > > > +		goto out_error;
> > > > > +
> > > > > +	/*
> > > > > +	 * Invalidate the page cache so that we can clear any CoW mappings
> > > > > +	 * in the destination file.
> > > > > +	 */
> > > > > +	truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff,
> > > > > +				   PAGE_ALIGN(destoff + len) - 1);
> > > > > +
> > > > > +	dfsbno = XFS_B_TO_FSBT(mp, destoff);
> > > > > +	sfsbno = XFS_B_TO_FSBT(mp, srcoff);
> > > > > +	fsblen = XFS_B_TO_FSB(mp, len);
> > > > > +	error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
> > > > > +			destoff + len);
> > > > > +	if (error)
> > > > > +		goto out_error;
> > > > > +
> > > > > +	error = xfs_reflink_update_dest(dest, destoff + len);
> > > > > +	if (error)
> > > > > +		goto out_error;
> > > > > +
> > > > > +out_error:
> > > > > +	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
> > > > > +	xfs_iunlock(src, XFS_IOLOCK_EXCL);
> > > > > +	if (src->i_ino != dest->i_ino) {
> > > > > +		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
> > > > > +		xfs_iunlock(dest, XFS_IOLOCK_EXCL);
> > > > > +	}
> > > > > +	if (error)
> > > > > +		trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_);
> > > > > +	return error;
> > > > > +}
> > > > > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> > > > > index 1d2f180..c35ce29 100644
> > > > > --- a/fs/xfs/xfs_reflink.h
> > > > > +++ b/fs/xfs/xfs_reflink.h
> > > > > @@ -43,5 +43,7 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
> > > > >  extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
> > > > >  		xfs_off_t count);
> > > > >  extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
> > > > > +extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff,
> > > > > +		struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len);
> > > > >  
> > > > >  #endif /* __XFS_REFLINK_H */
> > > > > 
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@xxxxxxxxxxxxxxx
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@xxxxxxxxxxxxxxx
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html