Re: [PATCH v4 03/17] xfs: simplify inode flush error handling

"Darrick J. Wong" <darrick.wong@xxxxxxxxxx> · Mon, 4 May 2020 14:31:27 -0700

On Mon, May 04, 2020 at 10:11:40AM -0400, Brian Foster wrote:
> The inode flush code has several layers of error handling between
> the inode and cluster flushing code. If the inode flush fails before
> acquiring the backing buffer, the inode flush is aborted. If the
> cluster flush fails, the current inode flush is aborted and the
> cluster buffer is failed to handle the initial inode and any others
> that might have been attached before the error.
> 
> Since xfs_iflush() is the only caller of xfs_iflush_cluster(), the
> error handling between the two can be condensed in the top-level
> function. If we update xfs_iflush_int() to always fall through to
> the log item update and attach the item completion handler to the
> buffer, any errors that occur after the first call to
> xfs_iflush_int() can be handled with a buffer I/O failure.
> 
> Lift the error handling from xfs_iflush_cluster() into xfs_iflush()
> and consolidate with the existing error handling. This also replaces
> the need to release the buffer because failing the buffer with
> XBF_ASYNC drops the current reference.
> 
> Signed-off-by: Brian Foster <bfoster@xxxxxxxxxx>
> Reviewed-by: Christoph Hellwig <hch@xxxxxx>

Seems fine to me,
Reviewed-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>

--D

> ---
>  fs/xfs/xfs_inode.c | 117 +++++++++++++++++----------------------------
>  1 file changed, 45 insertions(+), 72 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 909ca7c0bac4..84f2ee9957dc 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3496,6 +3496,7 @@ xfs_iflush_cluster(
>  	struct xfs_inode	**cilist;
>  	struct xfs_inode	*cip;
>  	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> +	int			error = 0;
>  	int			nr_found;
>  	int			clcount = 0;
>  	int			i;
> @@ -3588,11 +3589,10 @@ xfs_iflush_cluster(
>  		 * re-check that it's dirty before flushing.
>  		 */
>  		if (!xfs_inode_clean(cip)) {
> -			int	error;
>  			error = xfs_iflush_int(cip, bp);
>  			if (error) {
>  				xfs_iunlock(cip, XFS_ILOCK_SHARED);
> -				goto cluster_corrupt_out;
> +				goto out_free;
>  			}
>  			clcount++;
>  		} else {
> @@ -3611,33 +3611,7 @@ xfs_iflush_cluster(
>  	kmem_free(cilist);
>  out_put:
>  	xfs_perag_put(pag);
> -	return 0;
> -
> -
> -cluster_corrupt_out:
> -	/*
> -	 * Corruption detected in the clustering loop.  Invalidate the
> -	 * inode buffer and shut down the filesystem.
> -	 */
> -	rcu_read_unlock();
> -
> -	/*
> -	 * We'll always have an inode attached to the buffer for completion
> -	 * process by the time we are called from xfs_iflush(). Hence we have
> -	 * always need to do IO completion processing to abort the inodes
> -	 * attached to the buffer.  handle them just like the shutdown case in
> -	 * xfs_buf_submit().
> -	 */
> -	ASSERT(bp->b_iodone);
> -	bp->b_flags |= XBF_ASYNC;
> -	xfs_buf_ioend_fail(bp);
> -	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> -
> -	/* abort the corrupt inode, as it was not attached to the buffer */
> -	xfs_iflush_abort(cip, false);
> -	kmem_free(cilist);
> -	xfs_perag_put(pag);
> -	return -EFSCORRUPTED;
> +	return error;
>  }
>  
>  /*
> @@ -3693,17 +3667,16 @@ xfs_iflush(
>  	 */
>  	if (XFS_FORCED_SHUTDOWN(mp)) {
>  		error = -EIO;
> -		goto abort_out;
> +		goto abort;
>  	}
>  
>  	/*
>  	 * Get the buffer containing the on-disk inode. We are doing a try-lock
> -	 * operation here, so we may get  an EAGAIN error. In that case, we
> -	 * simply want to return with the inode still dirty.
> +	 * operation here, so we may get an EAGAIN error. In that case, return
> +	 * leaving the inode dirty.
>  	 *
>  	 * If we get any other error, we effectively have a corruption situation
> -	 * and we cannot flush the inode, so we treat it the same as failing
> -	 * xfs_iflush_int().
> +	 * and we cannot flush the inode. Abort the flush and shut down.
>  	 */
>  	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
>  			       0);
> @@ -3712,14 +3685,7 @@ xfs_iflush(
>  		return error;
>  	}
>  	if (error)
> -		goto corrupt_out;
> -
> -	/*
> -	 * First flush out the inode that xfs_iflush was called with.
> -	 */
> -	error = xfs_iflush_int(ip, bp);
> -	if (error)
> -		goto corrupt_out;
> +		goto abort;
>  
>  	/*
>  	 * If the buffer is pinned then push on the log now so we won't
> @@ -3729,28 +3695,29 @@ xfs_iflush(
>  		xfs_log_force(mp, 0);
>  
>  	/*
> -	 * inode clustering: try to gather other inodes into this write
> +	 * Flush the provided inode then attempt to gather others from the
> +	 * cluster into the write.
>  	 *
> -	 * Note: Any error during clustering will result in the filesystem
> -	 * being shut down and completion callbacks run on the cluster buffer.
> -	 * As we have already flushed and attached this inode to the buffer,
> -	 * it has already been aborted and released by xfs_iflush_cluster() and
> -	 * so we have no further error handling to do here.
> +	 * Note: Once we attempt to flush an inode, we must run buffer
> +	 * completion callbacks on any failure. If this fails, simulate an I/O
> +	 * failure on the buffer and shut down.
>  	 */
> -	error = xfs_iflush_cluster(ip, bp);
> -	if (error)
> -		return error;
> +	error = xfs_iflush_int(ip, bp);
> +	if (!error)
> +		error = xfs_iflush_cluster(ip, bp);
> +	if (error) {
> +		bp->b_flags |= XBF_ASYNC;
> +		xfs_buf_ioend_fail(bp);
> +		goto shutdown;
> +	}
>  
>  	*bpp = bp;
>  	return 0;
>  
> -corrupt_out:
> -	if (bp)
> -		xfs_buf_relse(bp);
> -	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> -abort_out:
> -	/* abort the corrupt inode, as it was not attached to the buffer */
> +abort:
>  	xfs_iflush_abort(ip, false);
> +shutdown:
> +	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
>  	return error;
>  }
>  
> @@ -3792,6 +3759,7 @@ xfs_iflush_int(
>  	struct xfs_inode_log_item *iip = ip->i_itemp;
>  	struct xfs_dinode	*dip;
>  	struct xfs_mount	*mp = ip->i_mount;
> +	int			error;
>  
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
>  	ASSERT(xfs_isiflocked(ip));
> @@ -3799,15 +3767,21 @@ xfs_iflush_int(
>  	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
>  	ASSERT(iip != NULL && iip->ili_fields != 0);
>  
> -	/* set *dip = inode's place in the buffer */
>  	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
>  
> +	/*
> +	 * We don't flush the inode if any of the following checks fail, but we
> +	 * do still update the log item and attach to the backing buffer as if
> +	 * the flush happened. This is a formality to facilitate predictable
> +	 * error handling as the caller will shutdown and fail the buffer.
> +	 */
> +	error = -EFSCORRUPTED;
>  	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
>  			       mp, XFS_ERRTAG_IFLUSH_1)) {
>  		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
>  			"%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
>  			__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
> -		goto corrupt_out;
> +		goto flush_out;
>  	}
>  	if (S_ISREG(VFS_I(ip)->i_mode)) {
>  		if (XFS_TEST_ERROR(
> @@ -3817,7 +3791,7 @@ xfs_iflush_int(
>  			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
>  				"%s: Bad regular inode %Lu, ptr "PTR_FMT,
>  				__func__, ip->i_ino, ip);
> -			goto corrupt_out;
> +			goto flush_out;
>  		}
>  	} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
>  		if (XFS_TEST_ERROR(
> @@ -3828,7 +3802,7 @@ xfs_iflush_int(
>  			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
>  				"%s: Bad directory inode %Lu, ptr "PTR_FMT,
>  				__func__, ip->i_ino, ip);
> -			goto corrupt_out;
> +			goto flush_out;
>  		}
>  	}
>  	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
> @@ -3839,14 +3813,14 @@ xfs_iflush_int(
>  			__func__, ip->i_ino,
>  			ip->i_d.di_nextents + ip->i_d.di_anextents,
>  			ip->i_d.di_nblocks, ip);
> -		goto corrupt_out;
> +		goto flush_out;
>  	}
>  	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
>  				mp, XFS_ERRTAG_IFLUSH_6)) {
>  		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
>  			"%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
>  			__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
> -		goto corrupt_out;
> +		goto flush_out;
>  	}
>  
>  	/*
> @@ -3863,7 +3837,7 @@ xfs_iflush_int(
>  
>  	/* Check the inline fork data before we write out. */
>  	if (!xfs_inode_verify_forks(ip))
> -		goto corrupt_out;
> +		goto flush_out;
>  
>  	/*
>  	 * Copy the dirty parts of the inode into the on-disk inode.  We always
> @@ -3906,6 +3880,8 @@ xfs_iflush_int(
>  	 * need the AIL lock, because it is a 64 bit value that cannot be read
>  	 * atomically.
>  	 */
> +	error = 0;
> +flush_out:
>  	iip->ili_last_fields = iip->ili_fields;
>  	iip->ili_fields = 0;
>  	iip->ili_fsync_fields = 0;
> @@ -3915,10 +3891,10 @@ xfs_iflush_int(
>  				&iip->ili_item.li_lsn);
>  
>  	/*
> -	 * Attach the function xfs_iflush_done to the inode's
> -	 * buffer.  This will remove the inode from the AIL
> -	 * and unlock the inode's flush lock when the inode is
> -	 * completely written to disk.
> +	 * Attach the inode item callback to the buffer whether the flush
> +	 * succeeded or not. If not, the caller will shut down and fail I/O
> +	 * completion on the buffer to remove the inode from the AIL and release
> +	 * the flush lock.
>  	 */
>  	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
>  
> @@ -3927,10 +3903,7 @@ xfs_iflush_int(
>  
>  	ASSERT(!list_empty(&bp->b_li_list));
>  	ASSERT(bp->b_iodone != NULL);
> -	return 0;
> -
> -corrupt_out:
> -	return -EFSCORRUPTED;
> +	return error;
>  }
>  
>  /* Release an inode. */
> -- 
> 2.21.1
>