Re: [PATCH 40/63] xfs: store in-progress CoW allocations in the refcount btree

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Oct 07, 2016 at 02:04:05PM -0400, Brian Foster wrote:
> On Thu, Sep 29, 2016 at 08:09:59PM -0700, Darrick J. Wong wrote:
> > Due to the way the CoW algorithm in XFS works, there's an interval
> > during which blocks allocated to handle a CoW can be lost -- if the FS
> > goes down after the blocks are allocated but before the block
> > remapping takes place.  This is exacerbated by the cowextsz hint --
> > allocated reservations can sit around for a while, waiting to get
> > used.
> > 
> > Since the refcount btree doesn't normally store records with refcount
> > of 1, we can use it to record these in-progress extents.  In-progress
> > blocks cannot be shared because they're not user-visible, so there
> > shouldn't be any conflicts with other programs.  This is a better
> > solution than holding EFIs during writeback because (a) EFIs can't be
> > relogged currently, (b) even if they could, EFIs are bound by
> > available log space, which puts an unnecessary upper bound on how much
> > CoW we can have in flight, and (c) we already have a mechanism to
> > track blocks.
> > 
> > At mount time, read the refcount records and free anything we find
> > with a refcount of 1 because those were in-progress when the FS went
> > down.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> > ---
> > v2: Use the deferred operations system to avoid deadlocks and blowing
> > out the transaction reservation.  This allows us to unmap a CoW
> > extent from the refcountbt and into a file atomically.
> > ---
> >  fs/xfs/libxfs/xfs_bmap.c     |   11 +
> >  fs/xfs/libxfs/xfs_format.h   |    3 
> >  fs/xfs/libxfs/xfs_refcount.c |  336 +++++++++++++++++++++++++++++++++++++++++-
> >  fs/xfs/libxfs/xfs_refcount.h |   10 +
> >  fs/xfs/xfs_mount.c           |   12 ++
> >  fs/xfs/xfs_refcount_item.c   |   12 ++
> >  fs/xfs/xfs_reflink.c         |  150 +++++++++++++++++++
> >  fs/xfs/xfs_reflink.h         |    1 
> >  fs/xfs/xfs_super.c           |    9 +
> >  fs/xfs/xfs_trace.h           |    4 +
> >  10 files changed, 537 insertions(+), 11 deletions(-)
> > 
> > 
> ...
> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index c95cdc3..673ecc1 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> ...
> > @@ -772,3 +787,138 @@ xfs_reflink_end_cow(
> >  	trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
> >  	return error;
> >  }
> ...
> > +STATIC int
> > +xfs_reflink_recover_cow_ag(
> > +	struct xfs_mount		*mp,
> > +	xfs_agnumber_t			agno)
> > +{
> > +	struct xfs_trans		*tp;
> > +	struct xfs_btree_cur		*cur;
> > +	struct xfs_buf			*agbp;
> > +	struct xfs_reflink_recovery	*rr, *n;
> > +	struct list_head		debris;
> > +	union xfs_btree_irec		low;
> > +	union xfs_btree_irec		high;
> > +	struct xfs_defer_ops		dfops;
> > +	xfs_fsblock_t			fsb;
> > +	int				error;
> > +
> > +	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
> > +	if (error)
> > +		return error;
> > +	cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
> > +
> > +	/* Find all the leftover CoW staging extents. */
> > +	INIT_LIST_HEAD(&debris);
> > +	memset(&low, 0, sizeof(low));
> > +	memset(&high, 0, sizeof(high));
> > +	low.rc.rc_startblock = 0;
> > +	high.rc.rc_startblock = -1U;
> > +	error = xfs_btree_query_range(cur, &low, &high,
> > +			xfs_reflink_recover_extent, &debris);
> > +	if (error)
> > +		goto out_error;
> 
> Potential memory leak of debris list entries on error.

Ugh, the error handling in this whole function needs reworking.

> > +	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
> > +	xfs_buf_relse(agbp);
> > +
> > +	/* Now iterate the list to free the leftovers */
> > +	list_for_each_entry(rr, &debris, rr_list) {
> > +		/* Set up transaction. */
> > +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
> > +		if (error)
> > +			goto out_free;
> > +
> > +		trace_xfs_reflink_recover_extent(mp, agno, &rr->rr_rrec);
> > +
> > +		/* Free the orphan record */
> > +		xfs_defer_init(&dfops, &fsb);
> > +		fsb = XFS_AGB_TO_FSB(mp, agno, rr->rr_rrec.rc_startblock);
> > +		error = xfs_refcount_free_cow_extent(mp, &dfops, fsb,
> > +				rr->rr_rrec.rc_blockcount);
> > +		if (error)
> > +			goto out_defer;
> > +
> > +		/* Free the block. */
> > +		xfs_bmap_add_free(mp, &dfops, fsb,
> > +				rr->rr_rrec.rc_blockcount, NULL);
> > +
> > +		error = xfs_defer_finish(&tp, &dfops, NULL);
> > +		if (error)
> > +			goto out_defer;
> > +
> > +		error = xfs_trans_commit(tp);
> > +		if (error)
> > +			goto out_cancel;
> > +	}
> > +	goto out_free;
> 
> return 0 ?

Yep.  Good catch.

--D

> 
> Brian
> 
> > +
> > +out_defer:
> > +	xfs_defer_cancel(&dfops);
> > +out_cancel:
> > +	xfs_trans_cancel(tp);
> > +
> > +out_free:
> > +	/* Free the leftover list */
> > +	list_for_each_entry_safe(rr, n, &debris, rr_list) {
> > +		list_del(&rr->rr_list);
> > +		kmem_free(rr);
> > +	}
> > +
> > +	return error;
> > +
> > +out_error:
> > +	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
> > +	xfs_buf_relse(agbp);
> > +	return error;
> > +}
> > +
> > +/*
> > + * Free leftover CoW reservations that didn't get cleaned out.
> > + */
> > +int
> > +xfs_reflink_recover_cow(
> > +	struct xfs_mount	*mp)
> > +{
> > +	xfs_agnumber_t		agno;
> > +	int			error = 0;
> > +
> > +	if (!xfs_sb_version_hasreflink(&mp->m_sb))
> > +		return 0;
> > +
> > +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> > +		error = xfs_reflink_recover_cow_ag(mp, agno);
> > +		if (error)
> > +			break;
> > +	}
> > +
> > +	return error;
> > +}
> > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> > index c0c989a..1d2f180 100644
> > --- a/fs/xfs/xfs_reflink.h
> > +++ b/fs/xfs/xfs_reflink.h
> > @@ -42,5 +42,6 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
> >  		xfs_off_t count);
> >  extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
> >  		xfs_off_t count);
> > +extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
> >  
> >  #endif /* __XFS_REFLINK_H */
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index 26b45b3..e6aaa91 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -1306,6 +1306,15 @@ xfs_fs_remount(
> >  		xfs_restore_resvblks(mp);
> >  		xfs_log_work_queue(mp);
> >  		xfs_queue_eofblocks(mp);
> > +
> > +		/* Recover any CoW blocks that never got remapped. */
> > +		error = xfs_reflink_recover_cow(mp);
> > +		if (error) {
> > +			xfs_err(mp,
> > +	"Error %d recovering leftover CoW allocations.", error);
> > +			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> > +			return error;
> > +		}
> >  	}
> >  
> >  	/* rw -> ro */
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index 8e89223..ca0930b 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -2916,14 +2916,18 @@ DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error);
> >  /* refcount adjustment tracepoints */
> >  DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase);
> >  DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease);
> > +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase);
> > +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease);
> >  DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents);
> >  DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent);
> > +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_reflink_recover_extent);
> >  DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent);
> >  DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent);
> >  DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent);
> >  DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent);
> >  DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent);
> >  DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error);
> > +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error);
> >  DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error);
> >  DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error);
> >  DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error);
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@xxxxxxxxxxxxxxx
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux