On Mon, Jun 22, 2020 at 09:01:35PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > The existing reflink remapping loop has some structural problems that > need addressing: > > The biggest problem is that we create one transaction for each extent in > the source file without accounting for the number of mappings there are > for the same range in the destination file. In other words, we don't > know the number of remap operations that will be necessary and we > therefore cannot guess the block reservation required. On highly > fragmented filesystems (e.g. ones with active dedupe) we guess wrong, > run out of block reservation, and fail. > > The second problem is that we don't actually use the bmap intents to > their full potential -- instead of calling bunmapi directly and having > to deal with its backwards operation, we could call the deferred ops > xfs_bmap_unmap_extent and xfs_refcount_decrease_extent instead. This > makes the frontend loop much simpler. > > A third (and more minor) problem is that we aren't even clever enough to > skip the whole remapping thing if the source and destination ranges > point to the same physical extents. > I'm wondering if this all really has to be done in one patch. The last bit at least sounds like an optimization from the description. > Solve all of these problems by refactoring the remapping loops so that > we only perform one remapping operation per transaction, and each > operation only tries to remap a single extent from source to dest. > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > --- > fs/xfs/libxfs/xfs_bmap.h | 13 ++- > fs/xfs/xfs_reflink.c | 234 ++++++++++++++++++++++++++-------------------- > fs/xfs/xfs_trace.h | 52 +--------- > fs/xfs/xfs_trans_dquot.c | 13 +-- > 4 files changed, 149 insertions(+), 163 deletions(-) > > > diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h > index 6028a3c825ba..3498b4f75f71 100644 > --- a/fs/xfs/libxfs/xfs_bmap.h > +++ b/fs/xfs/libxfs/xfs_bmap.h > @@ -158,6 +158,13 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) > { BMAP_ATTRFORK, "ATTR" }, \ > { BMAP_COWFORK, "COW" } > > +/* Return true if the extent is an allocated extent, written or not. */ > +static inline bool xfs_bmap_is_mapped_extent(struct xfs_bmbt_irec *irec) > +{ > + return irec->br_startblock != HOLESTARTBLOCK && > + irec->br_startblock != DELAYSTARTBLOCK && > + !isnullstartblock(irec->br_startblock); > +} Heh, I was going to suggest to call this _is_real_extent(), but I see the helper below already uses that name. I do think the name correlates better with this helper and perhaps the one below should be called something like xfs_bmap_is_written_extent(). Hm? Either way, the "mapped" name is kind of vague and brings back memories of buffer heads. > > /* > * Return true if the extent is a real, allocated extent, or false if it is a > @@ -165,10 +172,8 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) > */ > static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) > { > - return irec->br_state != XFS_EXT_UNWRITTEN && > - irec->br_startblock != HOLESTARTBLOCK && > - irec->br_startblock != DELAYSTARTBLOCK && > - !isnullstartblock(irec->br_startblock); > + return xfs_bmap_is_mapped_extent(irec) && > + irec->br_state != XFS_EXT_UNWRITTEN; > } > > /* > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > index 107bf2a2f344..f50a8c2f21a5 100644 > --- a/fs/xfs/xfs_reflink.c > +++ b/fs/xfs/xfs_reflink.c > @@ -984,40 +984,27 @@ xfs_reflink_ag_has_free_space( > } > > /* > - * Unmap a range of blocks from a file, then map other blocks into the hole. > - * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). > - * The extent irec is mapped into dest at irec->br_startoff. > + * Remap the given extent into the file at the given offset. The imap > + * blockcount will be set to the number of blocks that were actually remapped. > */ > STATIC int > xfs_reflink_remap_extent( > struct xfs_inode *ip, > - struct xfs_bmbt_irec *irec, > - xfs_fileoff_t destoff, > + struct xfs_bmbt_irec *imap, > xfs_off_t new_isize) > { > + struct xfs_bmbt_irec imap2; > struct xfs_mount *mp = ip->i_mount; > - bool real_extent = xfs_bmap_is_real_extent(irec); > struct xfs_trans *tp; > - unsigned int resblks; > - struct xfs_bmbt_irec uirec; > - xfs_filblks_t rlen; > - xfs_filblks_t unmap_len; > xfs_off_t newlen; > + int64_t ip_delta = 0; > + unsigned int resblks; > + bool real_extent = xfs_bmap_is_real_extent(imap); > + int nimaps; > int error; > > - unmap_len = irec->br_startoff + irec->br_blockcount - destoff; > - trace_xfs_reflink_punch_range(ip, destoff, unmap_len); > - > - /* No reflinking if we're low on space */ > - if (real_extent) { > - error = xfs_reflink_ag_has_free_space(mp, > - XFS_FSB_TO_AGNO(mp, irec->br_startblock)); > - if (error) > - goto out; > - } > - > /* Start a rolling transaction to switch the mappings */ > - resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); > + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); > error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); > if (error) > goto out; > @@ -1025,87 +1012,118 @@ xfs_reflink_remap_extent( > xfs_ilock(ip, XFS_ILOCK_EXCL); > xfs_trans_ijoin(tp, ip, 0); > > - /* If we're not just clearing space, then do we have enough quota? */ > + /* Read extent from the second file */ > + nimaps = 1; > + error = xfs_bmapi_read(ip, imap->br_startoff, imap->br_blockcount, > + &imap2, &nimaps, 0); > + if (error) > + goto out_cancel; > +#ifdef DEBUG > + if (nimaps != 1 || > + imap2.br_startoff != imap->br_startoff) { > + /* > + * We should never get no mapping or something that doesn't > + * match what we asked for. > + */ > + ASSERT(0); > + error = -EINVAL; > + goto out_cancel; > + } > +#endif Why the DEBUG? > + > + /* > + * We can only remap as many blocks as the smaller of the two extent > + * maps, because we can only remap one extent at a time. > + */ > + imap->br_blockcount = min(imap->br_blockcount, imap2.br_blockcount); > + > + trace_xfs_reflink_remap_extent2(ip, &imap2); > + > + /* > + * Two extents mapped to the same physical block must not have > + * different states; that's filesystem corruption. Move on to the next > + * extent if they're both holes or both the same physical extent. > + */ > + if (imap->br_startblock == imap2.br_startblock) { > + if (imap->br_state != imap2.br_state) > + error = -EFSCORRUPTED; Can we assert the length of each extent match here as well? > + goto out_cancel; > + } > + > if (real_extent) { > + /* No reflinking if we're low on space */ > + error = xfs_reflink_ag_has_free_space(mp, > + XFS_FSB_TO_AGNO(mp, imap->br_startblock)); > + if (error) > + goto out_cancel; > + > + > + /* Do we have enough quota? */ > error = xfs_trans_reserve_quota_nblks(tp, ip, > - irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); > + imap->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); Any reason this doesn't accommodate removal of real blocks? > if (error) > goto out_cancel; > } > > - trace_xfs_reflink_remap(ip, irec->br_startoff, > - irec->br_blockcount, irec->br_startblock); > - > - /* Unmap the old blocks in the data fork. */ > - rlen = unmap_len; > - while (rlen) { > - ASSERT(tp->t_firstblock == NULLFSBLOCK); > - error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1); > - if (error) > - goto out_cancel; > - > + if (xfs_bmap_is_mapped_extent(&imap2)) { > /* > - * Trim the extent to whatever got unmapped. > - * Remember, bunmapi works backwards. > + * If the extent we're unmapping is backed by storage (written > + * or not), unmap the extent and drop its refcount. > */ > - uirec.br_startblock = irec->br_startblock + rlen; > - uirec.br_startoff = irec->br_startoff + rlen; > - uirec.br_blockcount = unmap_len - rlen; > - uirec.br_state = irec->br_state; > - unmap_len = rlen; > + xfs_bmap_unmap_extent(tp, ip, &imap2); > + xfs_refcount_decrease_extent(tp, &imap2); > + ip_delta -= imap2.br_blockcount; > + } else if (imap2.br_startblock == DELAYSTARTBLOCK) { > + xfs_filblks_t len = imap2.br_blockcount; > > - /* If this isn't a real mapping, we're done. */ > - if (!real_extent || uirec.br_blockcount == 0) > - goto next_extent; > - > - trace_xfs_reflink_remap(ip, uirec.br_startoff, > - uirec.br_blockcount, uirec.br_startblock); > - > - /* Update the refcount tree */ > - xfs_refcount_increase_extent(tp, &uirec); > - > - /* Map the new blocks into the data fork. */ > - xfs_bmap_map_extent(tp, ip, &uirec); > + /* > + * If the extent we're unmapping is a delalloc reservation, > + * we can use the regular bunmapi function to release the > + * incore state. Dropping the delalloc reservation takes care > + * of the quota reservation for us. > + */ > + error = __xfs_bunmapi(NULL, ip, imap2.br_startoff, &len, 0, 1); > + if (error) > + goto out_cancel; > + ASSERT(len == 0); > + } > > - /* Update quota accounting. */ > - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, > - uirec.br_blockcount); > + /* > + * If the extent we're sharing is backed by written storage, increase > + * its refcount and map it into the file. > + */ > + if (real_extent) { > + xfs_refcount_increase_extent(tp, imap); > + xfs_bmap_map_extent(tp, ip, imap); > + ip_delta += imap->br_blockcount; > + } > > - /* Update dest isize if needed. */ > - newlen = XFS_FSB_TO_B(mp, > - uirec.br_startoff + uirec.br_blockcount); > - newlen = min_t(xfs_off_t, newlen, new_isize); > - if (newlen > i_size_read(VFS_I(ip))) { > - trace_xfs_reflink_update_inode_size(ip, newlen); > - i_size_write(VFS_I(ip), newlen); > - ip->i_d.di_size = newlen; > - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); > - } > + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, ip_delta); > > -next_extent: > - /* Process all the deferred stuff. */ > - error = xfs_defer_finish(&tp); > - if (error) > - goto out_cancel; > + /* Update dest isize if needed. */ > + newlen = XFS_FSB_TO_B(mp, imap2.br_startoff + imap2.br_blockcount); > + newlen = min_t(xfs_off_t, newlen, new_isize); > + if (newlen > i_size_read(VFS_I(ip))) { > + trace_xfs_reflink_update_inode_size(ip, newlen); > + i_size_write(VFS_I(ip), newlen); > + ip->i_d.di_size = newlen; > + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); > } > > + /* Commit everything and unlock. */ > error = xfs_trans_commit(tp); > - xfs_iunlock(ip, XFS_ILOCK_EXCL); > - if (error) > - goto out; > - return 0; > + goto out_unlock; We probably shouldn't fire the _error() tracepoint in the successful exit case. > > out_cancel: > xfs_trans_cancel(tp); > +out_unlock: > xfs_iunlock(ip, XFS_ILOCK_EXCL); > out: > trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); > return error; > } > > -/* > - * Iteratively remap one file's extents (and holes) to another's. > - */ > +/* Remap a range of one file to the other. */ > int > xfs_reflink_remap_blocks( > struct xfs_inode *src, > @@ -1116,25 +1134,22 @@ xfs_reflink_remap_blocks( > loff_t *remapped) > { > struct xfs_bmbt_irec imap; > - xfs_fileoff_t srcoff; > - xfs_fileoff_t destoff; > + struct xfs_mount *mp = src->i_mount; > + xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in); > + xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out); > xfs_filblks_t len; > - xfs_filblks_t range_len; > xfs_filblks_t remapped_len = 0; > xfs_off_t new_isize = pos_out + remap_len; > int nimaps; > int error = 0; > > - destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); > - srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); > - len = XFS_B_TO_FSB(src->i_mount, remap_len); > + len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), > + XFS_MAX_FILEOFF); > > - /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ > - while (len) { > - uint lock_mode; > + trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); > > - trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, > - dest, destoff); > + while (len > 0) { > + unsigned int lock_mode; > > /* Read extent from the source file */ > nimaps = 1; > @@ -1143,18 +1158,29 @@ xfs_reflink_remap_blocks( > xfs_iunlock(src, lock_mode); > if (error) > break; > - ASSERT(nimaps == 1); > +#ifdef DEBUG > + if (nimaps != 1 || > + imap.br_startblock == DELAYSTARTBLOCK || > + imap.br_startoff != srcoff) { > + /* > + * We should never get no mapping or a delalloc extent > + * or something that doesn't match what we asked for, > + * since the caller flushed the source file data and > + * we hold the source file io/mmap locks. > + */ > + ASSERT(nimaps == 0); > + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); > + ASSERT(imap.br_startoff == srcoff); > + error = -EINVAL; > + break; > + } > +#endif Same thing with the DEBUG check? It seems like at least some of these checks should be unconditional in the event the file is corrupted or something. > > - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, > - &imap); > + trace_xfs_reflink_remap_extent1(src, &imap); How about trace_xfs_reflink_remap_extent_[src|dest]() so trace data is a bit more readable? > > - /* Translate imap into the destination file. */ > - range_len = imap.br_startoff + imap.br_blockcount - srcoff; > - imap.br_startoff += destoff - srcoff; > - > - /* Clear dest from destoff to the end of imap and map it in. */ > - error = xfs_reflink_remap_extent(dest, &imap, destoff, > - new_isize); > + /* Remap into the destination file at the given offset. */ > + imap.br_startoff = destoff; > + error = xfs_reflink_remap_extent(dest, &imap, new_isize); > if (error) > break; > > @@ -1164,10 +1190,10 @@ xfs_reflink_remap_blocks( > } > > /* Advance drange/srange */ > - srcoff += range_len; > - destoff += range_len; > - len -= range_len; > - remapped_len += range_len; > + srcoff += imap.br_blockcount; > + destoff += imap.br_blockcount; > + len -= imap.br_blockcount; > + remapped_len += imap.br_blockcount; Much nicer iteration, indeed. ;) Brian > } > > if (error) > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > index 460136628a79..f205602c8ba9 100644 > --- a/fs/xfs/xfs_trace.h > +++ b/fs/xfs/xfs_trace.h > @@ -3052,8 +3052,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ > DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); > DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); > DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); > -DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); > -TRACE_EVENT(xfs_reflink_remap_blocks_loop, > +TRACE_EVENT(xfs_reflink_remap_blocks, > TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, > xfs_filblks_t len, struct xfs_inode *dest, > xfs_fileoff_t doffset), > @@ -3084,59 +3083,14 @@ TRACE_EVENT(xfs_reflink_remap_blocks_loop, > __entry->dest_ino, > __entry->dest_lblk) > ); > -TRACE_EVENT(xfs_reflink_punch_range, > - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, > - xfs_extlen_t len), > - TP_ARGS(ip, lblk, len), > - TP_STRUCT__entry( > - __field(dev_t, dev) > - __field(xfs_ino_t, ino) > - __field(xfs_fileoff_t, lblk) > - __field(xfs_extlen_t, len) > - ), > - TP_fast_assign( > - __entry->dev = VFS_I(ip)->i_sb->s_dev; > - __entry->ino = ip->i_ino; > - __entry->lblk = lblk; > - __entry->len = len; > - ), > - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", > - MAJOR(__entry->dev), MINOR(__entry->dev), > - __entry->ino, > - __entry->lblk, > - __entry->len) > -); > -TRACE_EVENT(xfs_reflink_remap, > - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, > - xfs_extlen_t len, xfs_fsblock_t new_pblk), > - TP_ARGS(ip, lblk, len, new_pblk), > - TP_STRUCT__entry( > - __field(dev_t, dev) > - __field(xfs_ino_t, ino) > - __field(xfs_fileoff_t, lblk) > - __field(xfs_extlen_t, len) > - __field(xfs_fsblock_t, new_pblk) > - ), > - TP_fast_assign( > - __entry->dev = VFS_I(ip)->i_sb->s_dev; > - __entry->ino = ip->i_ino; > - __entry->lblk = lblk; > - __entry->len = len; > - __entry->new_pblk = new_pblk; > - ), > - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", > - MAJOR(__entry->dev), MINOR(__entry->dev), > - __entry->ino, > - __entry->lblk, > - __entry->len, > - __entry->new_pblk) > -); > DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); > DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); > DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); > DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); > DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); > DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); > +DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent1); > +DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent2); > > /* dedupe tracepoints */ > DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); > diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c > index c0f73b82c055..99511ff6222f 100644 > --- a/fs/xfs/xfs_trans_dquot.c > +++ b/fs/xfs/xfs_trans_dquot.c > @@ -124,16 +124,17 @@ xfs_trans_dup_dqinfo( > */ > void > xfs_trans_mod_dquot_byino( > - xfs_trans_t *tp, > - xfs_inode_t *ip, > - uint field, > - int64_t delta) > + struct xfs_trans *tp, > + struct xfs_inode *ip, > + uint field, > + int64_t delta) > { > - xfs_mount_t *mp = tp->t_mountp; > + struct xfs_mount *mp = tp->t_mountp; > > if (!XFS_IS_QUOTA_RUNNING(mp) || > !XFS_IS_QUOTA_ON(mp) || > - xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) > + xfs_is_quota_inode(&mp->m_sb, ip->i_ino) || > + delta == 0) > return; > > if (tp->t_dqinfo == NULL) >