On Fri, Oct 07, 2016 at 02:04:15PM -0400, Brian Foster wrote: > On Thu, Sep 29, 2016 at 08:10:05PM -0700, Darrick J. Wong wrote: > > Reflink extents from one file to another; that is to say, iteratively > > remove the mappings from the destination file, copy the mappings from > > the source file to the destination file, and increment the reference > > count of all the blocks that got remapped. > > > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > --- > > v2: Call xfs_defer_cancel before cancelling the transaction if the > > remap operation fails. Use the deferred operations system to avoid > > deadlocks or blowing out the transaction reservation, and make the > > entire reflink operation atomic for each extent being remapped. The > > destination file's i_size will be updated if necessary to avoid > > violating the assumption that there are no shared blocks past the EOF > > block. > > --- > > fs/xfs/xfs_reflink.c | 425 ++++++++++++++++++++++++++++++++++++++++++++++++++ > > fs/xfs/xfs_reflink.h | 2 > > 2 files changed, 427 insertions(+) > > > > > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > > index 673ecc1..94c19fff 100644 > > --- a/fs/xfs/xfs_reflink.c > > +++ b/fs/xfs/xfs_reflink.c > > @@ -922,3 +922,428 @@ xfs_reflink_recover_cow( > > > > return error; > > } > ... > > +/* > > + * Unmap a range of blocks from a file, then map other blocks into the hole. > > + * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). > > + * The extent irec is mapped into dest at irec->br_startoff. > > + */ > > +STATIC int > > +xfs_reflink_remap_extent( > > + struct xfs_inode *ip, > > + struct xfs_bmbt_irec *irec, > > + xfs_fileoff_t destoff, > > + xfs_off_t new_isize) > > +{ > > + struct xfs_mount *mp = ip->i_mount; > > + struct xfs_trans *tp; > > + xfs_fsblock_t firstfsb; > > + unsigned int resblks; > > + struct xfs_defer_ops dfops; > > + struct xfs_bmbt_irec uirec; > > + bool real_extent; > > + xfs_filblks_t rlen; > > + xfs_filblks_t unmap_len; > > + xfs_off_t newlen; > > + int error; > > + > > + unmap_len = irec->br_startoff + irec->br_blockcount - destoff; > > + trace_xfs_reflink_punch_range(ip, destoff, unmap_len); > > + > > + /* Only remap normal extents. */ > > + real_extent = (irec->br_startblock != HOLESTARTBLOCK && > > + irec->br_startblock != DELAYSTARTBLOCK && > > + !ISUNWRITTEN(irec)); > > + > > + /* Start a rolling transaction to switch the mappings */ > > + resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); > > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); > > + if (error) > > + goto out; > > + > > + xfs_ilock(ip, XFS_ILOCK_EXCL); > > + xfs_trans_ijoin(tp, ip, 0); > > + > > + /* If we're not just clearing space, then do we have enough quota? */ > > + if (real_extent) { > > + error = xfs_trans_reserve_quota_nblks(tp, ip, > > + irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); > > + if (error) > > + goto out_cancel; > > + } > > + > > + trace_xfs_reflink_remap(ip, irec->br_startoff, > > + irec->br_blockcount, irec->br_startblock); > > + > > + /* Unmap the old blocks in the data fork. */ > > + rlen = unmap_len; > > + while (rlen) { > > + xfs_defer_init(&dfops, &firstfsb); > > + error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1, > > + &firstfsb, &dfops); > > + if (error) > > + goto out_defer; > > + > > + /* Trim the extent to whatever got unmapped. */ > > + uirec = *irec; > > + xfs_trim_extent(&uirec, destoff + rlen, unmap_len - rlen); > > + unmap_len = rlen; > > + > > + /* If this isn't a real mapping, we're done. */ > > + if (!real_extent || uirec.br_blockcount == 0) > > + goto next_extent; > > + > > Any reason we couldn't reuse existing mechanisms for this? E.g., hole > punch the dest file range before we remap the source file extents. That > might change behavior in the event of a partial/failed reflink, but it's > not clear to me that matters. It matters a lot for the dedupe operation -- the unmap and remap operations must be atomic with each other so that if the dedupe operation fails, the user will still see the same file contents after reboot/recovery. We don't want users to find their files suddenly full of zeroes. For reflink I suspect that you're right, but we already guarantee that the user sees either the old contents or the new contents, so yay. :) > > > + trace_xfs_reflink_remap(ip, uirec.br_startoff, > > + uirec.br_blockcount, uirec.br_startblock); > > + > ... > > +} > > + > > +/* > > + * Iteratively remap one file's extents (and holes) to another's. > > + */ > > +STATIC int > > +xfs_reflink_remap_blocks( > > + struct xfs_inode *src, > > + xfs_fileoff_t srcoff, > > + struct xfs_inode *dest, > > + xfs_fileoff_t destoff, > > + xfs_filblks_t len, > > + xfs_off_t new_isize) > > +{ > > + struct xfs_bmbt_irec imap; > > + int nimaps; > > + int error = 0; > > + xfs_filblks_t range_len; > > + > > + /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ > > + while (len) { > > + trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, > > + dest, destoff); > > + /* Read extent from the source file */ > > + nimaps = 1; > > + xfs_ilock(src, XFS_ILOCK_EXCL); > > + error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); > > + xfs_iunlock(src, XFS_ILOCK_EXCL); > > + if (error) > > + goto err; > > + ASSERT(nimaps == 1); > > + > > + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, > > + &imap); > > + > > + /* Translate imap into the destination file. */ > > + range_len = imap.br_startoff + imap.br_blockcount - srcoff; > > + imap.br_startoff += destoff - srcoff; > > + > > Just FYI... these are all unsigned vars... Yeah. It should handle that correctly. See generic/30[34]. --D > > Brian > > > + /* Clear dest from destoff to the end of imap and map it in. */ > > + error = xfs_reflink_remap_extent(dest, &imap, destoff, > > + new_isize); > > + if (error) > > + goto err; > > + > > + if (fatal_signal_pending(current)) { > > + error = -EINTR; > > + goto err; > > + } > > + > > + /* Advance drange/srange */ > > + srcoff += range_len; > > + destoff += range_len; > > + len -= range_len; > > + } > > + > > + return 0; > > + > > +err: > > + trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); > > + return error; > > +} > > + > > +/* > > + * Link a range of blocks from one file to another. > > + */ > > +int > > +xfs_reflink_remap_range( > > + struct xfs_inode *src, > > + xfs_off_t srcoff, > > + struct xfs_inode *dest, > > + xfs_off_t destoff, > > + xfs_off_t len) > > +{ > > + struct xfs_mount *mp = src->i_mount; > > + xfs_fileoff_t sfsbno, dfsbno; > > + xfs_filblks_t fsblen; > > + int error; > > + > > + if (!xfs_sb_version_hasreflink(&mp->m_sb)) > > + return -EOPNOTSUPP; > > + > > + if (XFS_FORCED_SHUTDOWN(mp)) > > + return -EIO; > > + > > + /* Don't reflink realtime inodes */ > > + if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) > > + return -EINVAL; > > + > > + trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff); > > + > > + /* Lock both files against IO */ > > + if (src->i_ino == dest->i_ino) { > > + xfs_ilock(src, XFS_IOLOCK_EXCL); > > + xfs_ilock(src, XFS_MMAPLOCK_EXCL); > > + } else { > > + xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); > > + xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); > > + } > > + > > + error = xfs_reflink_set_inode_flag(src, dest); > > + if (error) > > + goto out_error; > > + > > + /* > > + * Invalidate the page cache so that we can clear any CoW mappings > > + * in the destination file. > > + */ > > + truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff, > > + PAGE_ALIGN(destoff + len) - 1); > > + > > + dfsbno = XFS_B_TO_FSBT(mp, destoff); > > + sfsbno = XFS_B_TO_FSBT(mp, srcoff); > > + fsblen = XFS_B_TO_FSB(mp, len); > > + error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, > > + destoff + len); > > + if (error) > > + goto out_error; > > + > > + error = xfs_reflink_update_dest(dest, destoff + len); > > + if (error) > > + goto out_error; > > + > > +out_error: > > + xfs_iunlock(src, XFS_MMAPLOCK_EXCL); > > + xfs_iunlock(src, XFS_IOLOCK_EXCL); > > + if (src->i_ino != dest->i_ino) { > > + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); > > + xfs_iunlock(dest, XFS_IOLOCK_EXCL); > > + } > > + if (error) > > + trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_); > > + return error; > > +} > > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h > > index 1d2f180..c35ce29 100644 > > --- a/fs/xfs/xfs_reflink.h > > +++ b/fs/xfs/xfs_reflink.h > > @@ -43,5 +43,7 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, > > extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, > > xfs_off_t count); > > extern int xfs_reflink_recover_cow(struct xfs_mount *mp); > > +extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff, > > + struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len); > > > > #endif /* __XFS_REFLINK_H */ > > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@xxxxxxxxxxxxxxx > > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html