Re: [PATCH 3/5] xfs: proper replay of deferred ops queued during log recovery

On Tue, Sep 29, 2020 at 10:43:31AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> 
> When we replay unfinished intent items that have been recovered from the
> log, it's possible that the replay will cause the creation of more
> deferred work items.  As outlined in commit 509955823cc9c ("xfs: log
> recovery should replay deferred ops in order"), later work items have an
> implicit ordering dependency on earlier work items.  Therefore, recovery
> must replay the items (both recovered and created) in the same order
> that they would have been during normal operation.
> 
> For log recovery, we enforce this ordering by using an empty transaction
> to collect deferred ops that get created in the process of recovering a
> log intent item to prevent them from being committed before the rest of
> the recovered intent items.  After we finish committing all the
> recovered log items, we allocate a transaction with an enormous block
> reservation, splice our huge list of created deferred ops into that
> transaction, and commit it, thereby finishing all those ops.
> 
> This is /really/ hokey -- it's the one place in XFS where we allow
> nested transactions; the splicing of the defer ops list is inelegant
> and has to be done twice per recovery function; and the broken way we
> handle inode pointers and block reservations causes subtle use-after-free
> and allocator problems that will be fixed by this patch and the two
> patches after it.
> 
> Therefore, replace the hokey empty transaction with a structure designed
> to capture each chain of deferred ops that are created as part of
> recovering a single unfinished log intent.  Finally, refactor the loop
> that replays those chains to do so using one transaction per chain.
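Just to make sure I have the model right: the capture list preserves the
order in which the intents were recovered, and each captured chain then
gets its own transaction when the list is drained.  A crude userspace
sketch of that flow (not XFS code, all names here are invented) would be
something like:

/*
 * Toy illustration only: each recovered intent appends its chain of new
 * deferred work to a capture list, and the chains are then finished
 * strictly in capture order, one "transaction" per chain.
 */
#include <stdio.h>
#include <stdlib.h>

struct capture {
	struct capture	*next;		/* stand-in for dfc_list linkage */
	int		chain_id;	/* stand-in for the captured dfops chain */
};

/* Append a capture to the tail of the list so intent order is preserved. */
static void capture_chain(struct capture **head, int chain_id)
{
	struct capture *c = malloc(sizeof(*c));
	struct capture **p = head;

	if (!c)
		abort();
	c->chain_id = chain_id;
	c->next = NULL;
	while (*p)
		p = &(*p)->next;
	*p = c;
}

/* Finish every captured chain in order, one transaction per chain. */
static void finish_captures(struct capture **head)
{
	while (*head) {
		struct capture *c = *head;

		*head = c->next;
		printf("alloc tp, continue + commit chain %d\n", c->chain_id);
		free(c);
	}
}

int main(void)
{
	struct capture *captures = NULL;
	int i;

	for (i = 1; i <= 3; i++)	/* pretend recovery found three intents */
		capture_chain(&captures, i);
	finish_captures(&captures);
	return 0;
}

If that matches your intent, the rest of the commit message reads fine to me.
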
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> ---
>  fs/xfs/libxfs/xfs_defer.c  |   89 +++++++++++++++++++++++++++++++--
>  fs/xfs/libxfs/xfs_defer.h  |   19 +++++++
>  fs/xfs/xfs_bmap_item.c     |   16 +-----
>  fs/xfs/xfs_extfree_item.c  |    7 +--
>  fs/xfs/xfs_log_recover.c   |  118 +++++++++++++++++++++++++-------------------
>  fs/xfs/xfs_refcount_item.c |   16 +-----
>  fs/xfs/xfs_rmap_item.c     |    7 +--
>  fs/xfs/xfs_trans.h         |    3 +
>  8 files changed, 184 insertions(+), 91 deletions(-)
> 
> 
...
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index 84f876c6d498..550d0fa8057a 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -2434,37 +2434,62 @@ xlog_recover_process_data(
>  /* Take all the collected deferred ops and finish them in order. */
>  static int
>  xlog_finish_defer_ops(
> -	struct xfs_trans	*parent_tp)
> +	struct xfs_mount	*mp,
> +	struct list_head	*capture_list)
>  {
> -	struct xfs_mount	*mp = parent_tp->t_mountp;
> +	struct xfs_defer_capture *dfc, *next;
>  	struct xfs_trans	*tp;
>  	int64_t			freeblks;
> -	uint			resblks;
> -	int			error;
> +	uint64_t		resblks;
> +	int			error = 0;
>  
> -	/*
> -	 * We're finishing the defer_ops that accumulated as a result of
> -	 * recovering unfinished intent items during log recovery.  We
> -	 * reserve an itruncate transaction because it is the largest
> -	 * permanent transaction type.  Since we're the only user of the fs
> -	 * right now, take 93% (15/16) of the available free blocks.  Use
> -	 * weird math to avoid a 64-bit division.
> -	 */
> -	freeblks = percpu_counter_sum(&mp->m_fdblocks);
> -	if (freeblks <= 0)
> -		return -ENOSPC;
> -	resblks = min_t(int64_t, UINT_MAX, freeblks);
> -	resblks = (resblks * 15) >> 4;
> -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
> -			0, XFS_TRANS_RESERVE, &tp);
> -	if (error)
> -		return error;
> -	/* transfer all collected dfops to this transaction */
> -	xfs_defer_move(tp, parent_tp);
> +	list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
> +		/*
> +		 * We're finishing the defer_ops that accumulated as a result
> +		 * of recovering unfinished intent items during log recovery.
> +		 * We reserve an itruncate transaction because it is the
> +		 * largest permanent transaction type.  Since we're the only
> +		 * user of the fs right now, take 93% (15/16) of the available
> +		 * free blocks.  Use weird math to avoid a 64-bit division.
> +		 */
> +		freeblks = percpu_counter_sum(&mp->m_fdblocks);
> +		if (freeblks <= 0)
> +			return -ENOSPC;
>  
> -	return xfs_trans_commit(tp);
> +		resblks = min_t(uint64_t, UINT_MAX, freeblks);
> +		resblks = (resblks * 15) >> 4;
> +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
> +				0, XFS_TRANS_RESERVE, &tp);
> +		if (error)
> +			return error;
> +
> +		/* Transfer all collected dfops to this transaction. */

This old comment is a little misleading now that we have per-intent
capture lists. That nit aside, this looks good to me and the updated
factoring is easier to follow:
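Maybe something along these lines would read better (just a suggestion,
trivial enough to fix up on commit):

		/* Transfer this captured dfops chain to the transaction. */
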

Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx>

> +		list_del_init(&dfc->dfc_list);
> +		xfs_defer_ops_continue(dfc, tp);
> +
> +		error = xfs_trans_commit(tp);
> +		if (error)
> +			return error;
> +	}
> +
> +	ASSERT(list_empty(capture_list));
> +	return 0;
>  }
>  
> +/* Release all the captured defer ops and capture structures in this list. */
> +static void
> +xlog_abort_defer_ops(
> +	struct xfs_mount		*mp,
> +	struct list_head		*capture_list)
> +{
> +	struct xfs_defer_capture	*dfc;
> +	struct xfs_defer_capture	*next;
> +
> +	list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
> +		list_del_init(&dfc->dfc_list);
> +		xfs_defer_ops_release(mp, dfc);
> +	}
> +}
>  /*
>   * When this is called, all of the log intent items which did not have
>   * corresponding log done items should be in the AIL.  What we do now
> @@ -2485,35 +2510,23 @@ STATIC int
>  xlog_recover_process_intents(
>  	struct xlog		*log)
>  {
> -	struct xfs_trans	*parent_tp;
> +	LIST_HEAD(capture_list);
>  	struct xfs_ail_cursor	cur;
>  	struct xfs_log_item	*lip;
>  	struct xfs_ail		*ailp;
> -	int			error;
> +	int			error = 0;
>  #if defined(DEBUG) || defined(XFS_WARN)
>  	xfs_lsn_t		last_lsn;
>  #endif
>  
> -	/*
> -	 * The intent recovery handlers commit transactions to complete recovery
> -	 * for individual intents, but any new deferred operations that are
> -	 * queued during that process are held off until the very end. The
> -	 * purpose of this transaction is to serve as a container for deferred
> -	 * operations. Each intent recovery handler must transfer dfops here
> -	 * before its local transaction commits, and we'll finish the entire
> -	 * list below.
> -	 */
> -	error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
> -	if (error)
> -		return error;
> -
>  	ailp = log->l_ailp;
>  	spin_lock(&ailp->ail_lock);
> -	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
>  #if defined(DEBUG) || defined(XFS_WARN)
>  	last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
>  #endif
> -	while (lip != NULL) {
> +	for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
> +	     lip != NULL;
> +	     lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
>  		/*
>  		 * We're done when we see something other than an intent.
>  		 * There should be no intents left in the AIL now.
> @@ -2535,24 +2548,29 @@ xlog_recover_process_intents(
>  
>  		/*
>  		 * NOTE: If your intent processing routine can create more
> -		 * deferred ops, you /must/ attach them to the transaction in
> -		 * this routine or else those subsequent intents will get
> +		 * deferred ops, you /must/ attach them to the capture list in
> +		 * the recover routine or else those subsequent intents will be
>  		 * replayed in the wrong order!
>  		 */
>  		spin_unlock(&ailp->ail_lock);
> -		error = lip->li_ops->iop_recover(lip, parent_tp);
> +		error = lip->li_ops->iop_recover(lip, &capture_list);
>  		spin_lock(&ailp->ail_lock);
>  		if (error)
> -			goto out;
> -		lip = xfs_trans_ail_cursor_next(ailp, &cur);
> +			break;
>  	}
> -out:
> +
>  	xfs_trans_ail_cursor_done(&cur);
>  	spin_unlock(&ailp->ail_lock);
> -	if (!error)
> -		error = xlog_finish_defer_ops(parent_tp);
> -	xfs_trans_cancel(parent_tp);
> +	if (error)
> +		goto err;
>  
> +	error = xlog_finish_defer_ops(log->l_mp, &capture_list);
> +	if (error)
> +		goto err;
> +
> +	return 0;
> +err:
> +	xlog_abort_defer_ops(log->l_mp, &capture_list);
>  	return error;
>  }
>  
> diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
> index 3e34b7662361..0478374add64 100644
> --- a/fs/xfs/xfs_refcount_item.c
> +++ b/fs/xfs/xfs_refcount_item.c
> @@ -424,7 +424,7 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
>  STATIC int
>  xfs_cui_item_recover(
>  	struct xfs_log_item		*lip,
> -	struct xfs_trans		*parent_tp)
> +	struct list_head		*capture_list)
>  {
>  	struct xfs_bmbt_irec		irec;
>  	struct xfs_cui_log_item		*cuip = CUI_ITEM(lip);
> @@ -432,7 +432,7 @@ xfs_cui_item_recover(
>  	struct xfs_cud_log_item		*cudp;
>  	struct xfs_trans		*tp;
>  	struct xfs_btree_cur		*rcur = NULL;
> -	struct xfs_mount		*mp = parent_tp->t_mountp;
> +	struct xfs_mount		*mp = lip->li_mountp;
>  	xfs_fsblock_t			startblock_fsb;
>  	xfs_fsblock_t			new_fsb;
>  	xfs_extlen_t			new_len;
> @@ -487,12 +487,7 @@ xfs_cui_item_recover(
>  			mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
>  	if (error)
>  		return error;
> -	/*
> -	 * Recovery stashes all deferred ops during intent processing and
> -	 * finishes them on completion. Transfer current dfops state to this
> -	 * transaction and transfer the result back before we return.
> -	 */
> -	xfs_defer_move(tp, parent_tp);
> +
>  	cudp = xfs_trans_get_cud(tp, cuip);
>  
>  	for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
> @@ -549,13 +544,10 @@ xfs_cui_item_recover(
>  	}
>  
>  	xfs_refcount_finish_one_cleanup(tp, rcur, error);
> -	xfs_defer_capture(parent_tp, tp);
> -	error = xfs_trans_commit(tp);
> -	return error;
> +	return xfs_defer_ops_capture_and_commit(tp, capture_list);
>  
>  abort_error:
>  	xfs_refcount_finish_one_cleanup(tp, rcur, error);
> -	xfs_defer_move(parent_tp, tp);
>  	xfs_trans_cancel(tp);
>  	return error;
>  }
> diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
> index e38ec5d736be..0d8fa707f079 100644
> --- a/fs/xfs/xfs_rmap_item.c
> +++ b/fs/xfs/xfs_rmap_item.c
> @@ -467,14 +467,14 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
>  STATIC int
>  xfs_rui_item_recover(
>  	struct xfs_log_item		*lip,
> -	struct xfs_trans		*parent_tp)
> +	struct list_head		*capture_list)
>  {
>  	struct xfs_rui_log_item		*ruip = RUI_ITEM(lip);
>  	struct xfs_map_extent		*rmap;
>  	struct xfs_rud_log_item		*rudp;
>  	struct xfs_trans		*tp;
>  	struct xfs_btree_cur		*rcur = NULL;
> -	struct xfs_mount		*mp = parent_tp->t_mountp;
> +	struct xfs_mount		*mp = lip->li_mountp;
>  	xfs_fsblock_t			startblock_fsb;
>  	enum xfs_rmap_intent_type	type;
>  	xfs_exntst_t			state;
> @@ -567,8 +567,7 @@ xfs_rui_item_recover(
>  	}
>  
>  	xfs_rmap_finish_one_cleanup(tp, rcur, error);
> -	error = xfs_trans_commit(tp);
> -	return error;
> +	return xfs_defer_ops_capture_and_commit(tp, capture_list);
>  
>  abort_error:
>  	xfs_rmap_finish_one_cleanup(tp, rcur, error);
> diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> index ced62a35a62b..186e77d08cc1 100644
> --- a/fs/xfs/xfs_trans.h
> +++ b/fs/xfs/xfs_trans.h
> @@ -72,7 +72,8 @@ struct xfs_item_ops {
>  	void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
>  	void (*iop_release)(struct xfs_log_item *);
>  	xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
> -	int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp);
> +	int (*iop_recover)(struct xfs_log_item *lip,
> +			   struct list_head *capture_list);
>  	bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
>  };
>  
> 



