Re: [PATCH 5/5] xfs: order CIL checkpoint start records

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Jul 14, 2021 at 01:36:56PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> Because log recovery depends on strictly ordered start records as
> well as strictly ordered commit records.
> 
> This is a zero day bug in the way XFS writes pipelined transactions
> to the journal which is exposed by fixing the zero day bug that
> prevents the CIL from pipelining checkpoints. This re-introduces
> explicit concurrent commits back into the on-disk journal and hence
> out of order start records.
> 
> The XFS journal commit code has never ordered start records and we
> have relied on strict commit record ordering for correct recovery
> ordering of concurrently written transactions. Unfortunately, root
> cause analysis uncovered the fact that log recovery uses the LSN of
> the start record for transaction commit processing. Hence, whilst
> the commits are processed in strict order by recovery, the LSNs
> associated with the commits can be out of order and so recovery may
> stamp incorrect LSNs into objects and/or misorder intents in the AIL
> for later processing. This can result in log recovery failures
> and/or on disk corruption, sometimes silent.
> 
> Because this is a long standing log recovery issue, we can't just
> fix log recovery and call it good. This still leaves older kernels
> susceptible to recovery failures and corruption when replaying a log
> from a kernel that pipelines checkpoints. There is also the issue
> that in-memory ordering for AIL pushing and data integrity
> operations are based on checkpoint start LSNs, and if the start LSN
> is incorrect in the journal, it is also incorrect in memory.
> 
> Hence there's really only one choice for fixing this zero-day bug:
> we need to strictly order checkpoint start records in ascending
> sequence order in the log, the same way we already strictly order
> commit records.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>

Looks good to me now,
Reviewed-by: Darrick J. Wong <djwong@xxxxxxxxxx>

--D

> ---
>  fs/xfs/xfs_log.c      |  1 +
>  fs/xfs/xfs_log_cil.c  | 69 +++++++++++++++++++++++++++++++++++--------
>  fs/xfs/xfs_log_priv.h |  1 +
>  3 files changed, 58 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index f41c6f388456..98d8fea6a83b 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -3777,6 +3777,7 @@ xlog_force_shutdown(
>  	 * avoid races.
>  	 */
>  	spin_lock(&log->l_cilp->xc_push_lock);
> +	wake_up_all(&log->l_cilp->xc_start_wait);
>  	wake_up_all(&log->l_cilp->xc_commit_wait);
>  	spin_unlock(&log->l_cilp->xc_push_lock);
>  	xlog_state_shutdown_callbacks(log);
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index 18a2d6a9f26e..5ef47aed10f9 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -587,6 +587,7 @@ xlog_cil_committed(
>  	 */
>  	if (abort) {
>  		spin_lock(&ctx->cil->xc_push_lock);
> +		wake_up_all(&ctx->cil->xc_start_wait);
>  		wake_up_all(&ctx->cil->xc_commit_wait);
>  		spin_unlock(&ctx->cil->xc_push_lock);
>  	}
> @@ -640,7 +641,14 @@ xlog_cil_set_ctx_write_state(
>  	ASSERT(!ctx->commit_lsn);
>  	if (!ctx->start_lsn) {
>  		spin_lock(&cil->xc_push_lock);
> +		/*
> +		 * The LSN we need to pass to the log items on transaction
> +		 * commit is the LSN reported by the first log vector write, not
> +		 * the commit lsn. If we use the commit record lsn then we can
> +		 * move the tail beyond the grant write head.
> +		 */
>  		ctx->start_lsn = lsn;
> +		wake_up_all(&cil->xc_start_wait);
>  		spin_unlock(&cil->xc_push_lock);
>  		return;
>  	}
> @@ -682,10 +690,16 @@ xlog_cil_set_ctx_write_state(
>   * relies on the context LSN being zero until the log write has guaranteed the
>   * LSN that the log write will start at via xlog_state_get_iclog_space().
>   */
> +enum _record_type {
> +	_START_RECORD,
> +	_COMMIT_RECORD,
> +};
> +
>  static int
>  xlog_cil_order_write(
>  	struct xfs_cil		*cil,
> -	xfs_csn_t		sequence)
> +	xfs_csn_t		sequence,
> +	enum _record_type	record)
>  {
>  	struct xfs_cil_ctx	*ctx;
>  
> @@ -708,19 +722,47 @@ xlog_cil_order_write(
>  		 */
>  		if (ctx->sequence >= sequence)
>  			continue;
> -		if (!ctx->commit_lsn) {
> -			/*
> -			 * It is still being pushed! Wait for the push to
> -			 * complete, then start again from the beginning.
> -			 */
> -			xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
> -			goto restart;
> +
> +		/* Wait until the LSN for the record has been recorded. */
> +		switch (record) {
> +		case _START_RECORD:
> +			if (!ctx->start_lsn) {
> +				xlog_wait(&cil->xc_start_wait, &cil->xc_push_lock);
> +				goto restart;
> +			}
> +			break;
> +		case _COMMIT_RECORD:
> +			if (!ctx->commit_lsn) {
> +				xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
> +				goto restart;
> +			}
> +			break;
>  		}
>  	}
>  	spin_unlock(&cil->xc_push_lock);
>  	return 0;
>  }
>  
> +/*
> + * Write out the log vector change now attached to the CIL context. This will
> + * write a start record that needs to be strictly ordered in ascending CIL
> + * sequence order so that log recovery will always use in-order start LSNs when
> + * replaying checkpoints.
> + */
> +static int
> +xlog_cil_write_chain(
> +	struct xfs_cil_ctx	*ctx,
> +	struct xfs_log_vec	*chain)
> +{
> +	struct xlog		*log = ctx->cil->xc_log;
> +	int			error;
> +
> +	error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
> +	if (error)
> +		return error;
> +	return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS);
> +}
> +
>  /*
>   * Write out the commit record of a checkpoint transaction to close off a
>   * running log write. These commit records are strictly ordered in ascending CIL
> @@ -746,6 +788,10 @@ xlog_cil_write_commit_record(
>  	if (xlog_is_shutdown(log))
>  		return -EIO;
>  
> +	error = xlog_cil_order_write(ctx->cil, ctx->sequence, _COMMIT_RECORD);
> +	if (error)
> +		return error;
> +
>  	error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS);
>  	if (error)
>  		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
> @@ -955,11 +1001,7 @@ xlog_cil_push_work(
>  	 */
>  	wait_for_completion(&bdev_flush);
>  
> -	error = xlog_write(log, ctx, &lvhdr, tic, XLOG_START_TRANS);
> -	if (error)
> -		goto out_abort_free_ticket;
> -
> -	error = xlog_cil_order_write(ctx->cil, ctx->sequence);
> +	error = xlog_cil_write_chain(ctx, &lvhdr);
>  	if (error)
>  		goto out_abort_free_ticket;
>  
> @@ -1364,6 +1406,7 @@ xlog_cil_init(
>  	spin_lock_init(&cil->xc_push_lock);
>  	init_waitqueue_head(&cil->xc_push_wait);
>  	init_rwsem(&cil->xc_ctx_lock);
> +	init_waitqueue_head(&cil->xc_start_wait);
>  	init_waitqueue_head(&cil->xc_commit_wait);
>  
>  	INIT_LIST_HEAD(&ctx->committing);
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index f74e3968bb84..400471fa12d2 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -271,6 +271,7 @@ struct xfs_cil {
>  	xfs_csn_t		xc_push_seq;
>  	struct list_head	xc_committing;
>  	wait_queue_head_t	xc_commit_wait;
> +	wait_queue_head_t	xc_start_wait;
>  	xfs_csn_t		xc_current_sequence;
>  	struct work_struct	xc_push_work;
>  	wait_queue_head_t	xc_push_wait;	/* background push throttle */
> -- 
> 2.31.1
> 



[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux