Re: [PATCH 2/5] xfs: separate out inode buffer recovery a bit more

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Mar 19, 2024 at 01:15:21PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> It really is a unique snowflake, so peal off from normal buffer
> recovery earlier and shuffle all the unique bits into the inode
> buffer recovery function.
> 
> Also, it looks like the handling of mismatched inode cluster buffer
> sizes is wrong - we have to write the recovered buffer -before- we
> mark it stale as we're not supposed to write stale buffers. I don't
> think we check that anywhere in the buffer IO path, but lets do it
> the right way anyway.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>

Looks good to me,
Reviewed-by: Darrick J. Wong <djwong@xxxxxxxxxx>

--D

> ---
>  fs/xfs/xfs_buf_item_recover.c | 99 ++++++++++++++++++++++-------------
>  1 file changed, 63 insertions(+), 36 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
> index dba57ee6fa6d..f994a303ad0a 100644
> --- a/fs/xfs/xfs_buf_item_recover.c
> +++ b/fs/xfs/xfs_buf_item_recover.c
> @@ -229,7 +229,7 @@ xlog_recover_validate_buf_type(
>  	 * just avoid the verification stage for non-crc filesystems
>  	 */
>  	if (!xfs_has_crc(mp))
> -		return;
> +		return 0;
>  
>  	magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
>  	magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
> @@ -407,7 +407,7 @@ xlog_recover_validate_buf_type(
>  	 * skipped.
>  	 */
>  	if (current_lsn == NULLCOMMITLSN)
> -		return 0;;
> +		return 0;
>  
>  	if (warnmsg) {
>  		xfs_warn(mp, warnmsg);
> @@ -567,18 +567,22 @@ xlog_recover_this_dquot_buffer(
>  }
>  
>  /*
> - * Perform recovery for a buffer full of inodes.  In these buffers, the only
> - * data which should be recovered is that which corresponds to the
> - * di_next_unlinked pointers in the on disk inode structures.  The rest of the
> - * data for the inodes is always logged through the inodes themselves rather
> - * than the inode buffer and is recovered in xlog_recover_inode_pass2().
> + * Perform recovery for a buffer full of inodes. We don't have inode cluster
> + * buffer specific LSNs, so we always recover inode buffers if they contain
> + * inodes.
> + *
> + * In these buffers, the only inode data which should be recovered is that which
> + * corresponds to the di_next_unlinked pointers in the on disk inode structures.
> + * The rest of the data for the inodes is always logged through the inodes
> + * themselves rather than the inode buffer and is recovered in
> + * xlog_recover_inode_pass2().
>   *
>   * The only time when buffers full of inodes are fully recovered is when the
> - * buffer is full of newly allocated inodes.  In this case the buffer will
> - * not be marked as an inode buffer and so will be sent to
> - * xlog_recover_do_reg_buffer() below during recovery.
> + * buffer is full of newly allocated inodes.  In this case the buffer will not
> + * be marked as an inode buffer and so xlog_recover_do_reg_buffer() will be used
> + * instead.
>   */
> -STATIC int
> +static int
>  xlog_recover_do_inode_buffer(
>  	struct xfs_mount		*mp,
>  	struct xlog_recover_item	*item,
> @@ -598,6 +602,13 @@ xlog_recover_do_inode_buffer(
>  
>  	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
>  
> +	/*
> +	 * If the magic number doesn't match, something has gone wrong. Don't
> +	 * recover the buffer.
> +	 */
> +	if (cpu_to_be16(XFS_DINODE_MAGIC) != *((__be16 *)bp->b_addr))
> +		return -EFSCORRUPTED;
> +
>  	/*
>  	 * Post recovery validation only works properly on CRC enabled
>  	 * filesystems.
> @@ -677,6 +688,31 @@ xlog_recover_do_inode_buffer(
>  
>  	}
>  
> +	/*
> +	 * Make sure that only inode buffers with good sizes remain valid after
> +	 * recovering this buffer item.
> +	 *
> +	 * The kernel moves inodes in buffers of 1 block or inode_cluster_size
> +	 * bytes, whichever is bigger.  The inode buffers in the log can be a
> +	 * different size if the log was generated by an older kernel using
> +	 * unclustered inode buffers or a newer kernel running with a different
> +	 * inode cluster size.  Regardless, if the inode buffer size isn't
> +	 * max(blocksize, inode_cluster_size) for *our* value of
> +	 * inode_cluster_size, then we need to keep the buffer out of the buffer
> +	 * cache so that the buffer won't overlap with future reads of those
> +	 * inodes.
> +	 *
> +	 * To acheive this, we write the buffer ito recover the inodes then mark
> +	 * it stale so that it won't be found on overlapping buffer lookups and
> +	 * caller knows not to queue it for delayed write.
> +	 */
> +	if (BBTOB(bp->b_length) != M_IGEO(mp)->inode_cluster_size) {
> +		int error;
> +
> +		error = xfs_bwrite(bp);
> +		xfs_buf_stale(bp);
> +		return error;
> +	}
>  	return 0;
>  }
>  
> @@ -840,7 +876,6 @@ xlog_recover_get_buf_lsn(
>  	magic16 = be16_to_cpu(*(__be16 *)blk);
>  	switch (magic16) {
>  	case XFS_DQUOT_MAGIC:
> -	case XFS_DINODE_MAGIC:
>  		goto recover_immediately;
>  	default:
>  		break;
> @@ -910,6 +945,17 @@ xlog_recover_buf_commit_pass2(
>  	if (error)
>  		return error;
>  
> +	/*
> +	 * Inode buffer recovery is quite unique, so go out separate ways here
> +	 * to simplify the rest of the code.
> +	 */
> +	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
> +		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
> +		if (error || (bp->b_flags & XBF_STALE))
> +			goto out_release;
> +		goto out_write;
> +	}
> +
>  	/*
>  	 * Recover the buffer only if we get an LSN from it and it's less than
>  	 * the lsn of the transaction we are replaying.
> @@ -946,9 +992,7 @@ xlog_recover_buf_commit_pass2(
>  		goto out_release;
>  	}
>  
> -	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
> -		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
> -	} else if (buf_f->blf_flags &
> +	if (buf_f->blf_flags &
>  		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
>  		if (!xlog_recover_this_dquot_buffer(mp, log, item, bp, buf_f))
>  			goto out_release;
> @@ -965,28 +1009,11 @@ xlog_recover_buf_commit_pass2(
>  	/*
>  	 * Perform delayed write on the buffer.  Asynchronous writes will be
>  	 * slower when taking into account all the buffers to be flushed.
> -	 *
> -	 * Also make sure that only inode buffers with good sizes stay in
> -	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
> -	 * or inode_cluster_size bytes, whichever is bigger.  The inode
> -	 * buffers in the log can be a different size if the log was generated
> -	 * by an older kernel using unclustered inode buffers or a newer kernel
> -	 * running with a different inode cluster size.  Regardless, if
> -	 * the inode buffer size isn't max(blocksize, inode_cluster_size)
> -	 * for *our* value of inode_cluster_size, then we need to keep
> -	 * the buffer out of the buffer cache so that the buffer won't
> -	 * overlap with future reads of those inodes.
>  	 */
> -	if (XFS_DINODE_MAGIC ==
> -	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
> -	    (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
> -		xfs_buf_stale(bp);
> -		error = xfs_bwrite(bp);
> -	} else {
> -		ASSERT(bp->b_mount == mp);
> -		bp->b_flags |= _XBF_LOGRECOVERY;
> -		xfs_buf_delwri_queue(bp, buffer_list);
> -	}
> +out_write:
> +	ASSERT(bp->b_mount == mp);
> +	bp->b_flags |= _XBF_LOGRECOVERY;
> +	xfs_buf_delwri_queue(bp, buffer_list);
>  
>  out_release:
>  	xfs_buf_relse(bp);
> -- 
> 2.43.0
> 
> 




[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux