On Fri, Oct 07, 2016 at 02:04:05PM -0400, Brian Foster wrote: > On Thu, Sep 29, 2016 at 08:09:59PM -0700, Darrick J. Wong wrote: > > Due to the way the CoW algorithm in XFS works, there's an interval > > during which blocks allocated to handle a CoW can be lost -- if the FS > > goes down after the blocks are allocated but before the block > > remapping takes place. This is exacerbated by the cowextsz hint -- > > allocated reservations can sit around for a while, waiting to get > > used. > > > > Since the refcount btree doesn't normally store records with refcount > > of 1, we can use it to record these in-progress extents. In-progress > > blocks cannot be shared because they're not user-visible, so there > > shouldn't be any conflicts with other programs. This is a better > > solution than holding EFIs during writeback because (a) EFIs can't be > > relogged currently, (b) even if they could, EFIs are bound by > > available log space, which puts an unnecessary upper bound on how much > > CoW we can have in flight, and (c) we already have a mechanism to > > track blocks. > > > > At mount time, read the refcount records and free anything we find > > with a refcount of 1 because those were in-progress when the FS went > > down. > > > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > --- > > v2: Use the deferred operations system to avoid deadlocks and blowing > > out the transaction reservation. This allows us to unmap a CoW > > extent from the refcountbt and into a file atomically. > > --- > > fs/xfs/libxfs/xfs_bmap.c | 11 + > > fs/xfs/libxfs/xfs_format.h | 3 > > fs/xfs/libxfs/xfs_refcount.c | 336 +++++++++++++++++++++++++++++++++++++++++- > > fs/xfs/libxfs/xfs_refcount.h | 10 + > > fs/xfs/xfs_mount.c | 12 ++ > > fs/xfs/xfs_refcount_item.c | 12 ++ > > fs/xfs/xfs_reflink.c | 150 +++++++++++++++++++ > > fs/xfs/xfs_reflink.h | 1 > > fs/xfs/xfs_super.c | 9 + > > fs/xfs/xfs_trace.h | 4 + > > 10 files changed, 537 insertions(+), 11 deletions(-) > > > > > ... > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > > index c95cdc3..673ecc1 100644 > > --- a/fs/xfs/xfs_reflink.c > > +++ b/fs/xfs/xfs_reflink.c > ... > > @@ -772,3 +787,138 @@ xfs_reflink_end_cow( > > trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); > > return error; > > } > ... > > +STATIC int > > +xfs_reflink_recover_cow_ag( > > + struct xfs_mount *mp, > > + xfs_agnumber_t agno) > > +{ > > + struct xfs_trans *tp; > > + struct xfs_btree_cur *cur; > > + struct xfs_buf *agbp; > > + struct xfs_reflink_recovery *rr, *n; > > + struct list_head debris; > > + union xfs_btree_irec low; > > + union xfs_btree_irec high; > > + struct xfs_defer_ops dfops; > > + xfs_fsblock_t fsb; > > + int error; > > + > > + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); > > + if (error) > > + return error; > > + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); > > + > > + /* Find all the leftover CoW staging extents. */ > > + INIT_LIST_HEAD(&debris); > > + memset(&low, 0, sizeof(low)); > > + memset(&high, 0, sizeof(high)); > > + low.rc.rc_startblock = 0; > > + high.rc.rc_startblock = -1U; > > + error = xfs_btree_query_range(cur, &low, &high, > > + xfs_reflink_recover_extent, &debris); > > + if (error) > > + goto out_error; > > Potential memory leak of debris list entries on error. Ugh, the error handling in this whole function needs reworking. > > + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); > > + xfs_buf_relse(agbp); > > + > > + /* Now iterate the list to free the leftovers */ > > + list_for_each_entry(rr, &debris, rr_list) { > > + /* Set up transaction. */ > > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); > > + if (error) > > + goto out_free; > > + > > + trace_xfs_reflink_recover_extent(mp, agno, &rr->rr_rrec); > > + > > + /* Free the orphan record */ > > + xfs_defer_init(&dfops, &fsb); > > + fsb = XFS_AGB_TO_FSB(mp, agno, rr->rr_rrec.rc_startblock); > > + error = xfs_refcount_free_cow_extent(mp, &dfops, fsb, > > + rr->rr_rrec.rc_blockcount); > > + if (error) > > + goto out_defer; > > + > > + /* Free the block. */ > > + xfs_bmap_add_free(mp, &dfops, fsb, > > + rr->rr_rrec.rc_blockcount, NULL); > > + > > + error = xfs_defer_finish(&tp, &dfops, NULL); > > + if (error) > > + goto out_defer; > > + > > + error = xfs_trans_commit(tp); > > + if (error) > > + goto out_cancel; > > + } > > + goto out_free; > > return 0 ? Yep. Good catch. --D > > Brian > > > + > > +out_defer: > > + xfs_defer_cancel(&dfops); > > +out_cancel: > > + xfs_trans_cancel(tp); > > + > > +out_free: > > + /* Free the leftover list */ > > + list_for_each_entry_safe(rr, n, &debris, rr_list) { > > + list_del(&rr->rr_list); > > + kmem_free(rr); > > + } > > + > > + return error; > > + > > +out_error: > > + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); > > + xfs_buf_relse(agbp); > > + return error; > > +} > > + > > +/* > > + * Free leftover CoW reservations that didn't get cleaned out. > > + */ > > +int > > +xfs_reflink_recover_cow( > > + struct xfs_mount *mp) > > +{ > > + xfs_agnumber_t agno; > > + int error = 0; > > + > > + if (!xfs_sb_version_hasreflink(&mp->m_sb)) > > + return 0; > > + > > + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { > > + error = xfs_reflink_recover_cow_ag(mp, agno); > > + if (error) > > + break; > > + } > > + > > + return error; > > +} > > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h > > index c0c989a..1d2f180 100644 > > --- a/fs/xfs/xfs_reflink.h > > +++ b/fs/xfs/xfs_reflink.h > > @@ -42,5 +42,6 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, > > xfs_off_t count); > > extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, > > xfs_off_t count); > > +extern int xfs_reflink_recover_cow(struct xfs_mount *mp); > > > > #endif /* __XFS_REFLINK_H */ > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > > index 26b45b3..e6aaa91 100644 > > --- a/fs/xfs/xfs_super.c > > +++ b/fs/xfs/xfs_super.c > > @@ -1306,6 +1306,15 @@ xfs_fs_remount( > > xfs_restore_resvblks(mp); > > xfs_log_work_queue(mp); > > xfs_queue_eofblocks(mp); > > + > > + /* Recover any CoW blocks that never got remapped. */ > > + error = xfs_reflink_recover_cow(mp); > > + if (error) { > > + xfs_err(mp, > > + "Error %d recovering leftover CoW allocations.", error); > > + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); > > + return error; > > + } > > } > > > > /* rw -> ro */ > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > > index 8e89223..ca0930b 100644 > > --- a/fs/xfs/xfs_trace.h > > +++ b/fs/xfs/xfs_trace.h > > @@ -2916,14 +2916,18 @@ DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error); > > /* refcount adjustment tracepoints */ > > DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase); > > DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease); > > +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase); > > +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease); > > DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents); > > DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent); > > +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_reflink_recover_extent); > > DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent); > > DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent); > > DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent); > > DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent); > > DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent); > > DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error); > > +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error); > > DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error); > > DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error); > > DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error); > > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@xxxxxxxxxxxxxxx > > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html