On Mon, May 04, 2020 at 10:11:40AM -0400, Brian Foster wrote: > The inode flush code has several layers of error handling between > the inode and cluster flushing code. If the inode flush fails before > acquiring the backing buffer, the inode flush is aborted. If the > cluster flush fails, the current inode flush is aborted and the > cluster buffer is failed to handle the initial inode and any others > that might have been attached before the error. > > Since xfs_iflush() is the only caller of xfs_iflush_cluster(), the > error handling between the two can be condensed in the top-level > function. If we update xfs_iflush_int() to always fall through to > the log item update and attach the item completion handler to the > buffer, any errors that occur after the first call to > xfs_iflush_int() can be handled with a buffer I/O failure. > > Lift the error handling from xfs_iflush_cluster() into xfs_iflush() > and consolidate with the existing error handling. This also replaces > the need to release the buffer because failing the buffer with > XBF_ASYNC drops the current reference. > > Signed-off-by: Brian Foster <bfoster@xxxxxxxxxx> > Reviewed-by: Christoph Hellwig <hch@xxxxxx> Seems fine to me, Reviewed-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --D > --- > fs/xfs/xfs_inode.c | 117 +++++++++++++++++---------------------------- > 1 file changed, 45 insertions(+), 72 deletions(-) > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index 909ca7c0bac4..84f2ee9957dc 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -3496,6 +3496,7 @@ xfs_iflush_cluster( > struct xfs_inode **cilist; > struct xfs_inode *cip; > struct xfs_ino_geometry *igeo = M_IGEO(mp); > + int error = 0; > int nr_found; > int clcount = 0; > int i; > @@ -3588,11 +3589,10 @@ xfs_iflush_cluster( > * re-check that it's dirty before flushing. > */ > if (!xfs_inode_clean(cip)) { > - int error; > error = xfs_iflush_int(cip, bp); > if (error) { > xfs_iunlock(cip, XFS_ILOCK_SHARED); > - goto cluster_corrupt_out; > + goto out_free; > } > clcount++; > } else { > @@ -3611,33 +3611,7 @@ xfs_iflush_cluster( > kmem_free(cilist); > out_put: > xfs_perag_put(pag); > - return 0; > - > - > -cluster_corrupt_out: > - /* > - * Corruption detected in the clustering loop. Invalidate the > - * inode buffer and shut down the filesystem. > - */ > - rcu_read_unlock(); > - > - /* > - * We'll always have an inode attached to the buffer for completion > - * process by the time we are called from xfs_iflush(). Hence we have > - * always need to do IO completion processing to abort the inodes > - * attached to the buffer. handle them just like the shutdown case in > - * xfs_buf_submit(). > - */ > - ASSERT(bp->b_iodone); > - bp->b_flags |= XBF_ASYNC; > - xfs_buf_ioend_fail(bp); > - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); > - > - /* abort the corrupt inode, as it was not attached to the buffer */ > - xfs_iflush_abort(cip, false); > - kmem_free(cilist); > - xfs_perag_put(pag); > - return -EFSCORRUPTED; > + return error; > } > > /* > @@ -3693,17 +3667,16 @@ xfs_iflush( > */ > if (XFS_FORCED_SHUTDOWN(mp)) { > error = -EIO; > - goto abort_out; > + goto abort; > } > > /* > * Get the buffer containing the on-disk inode. We are doing a try-lock > - * operation here, so we may get an EAGAIN error. In that case, we > - * simply want to return with the inode still dirty. > + * operation here, so we may get an EAGAIN error. In that case, return > + * leaving the inode dirty. > * > * If we get any other error, we effectively have a corruption situation > - * and we cannot flush the inode, so we treat it the same as failing > - * xfs_iflush_int(). > + * and we cannot flush the inode. Abort the flush and shut down. > */ > error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, > 0); > @@ -3712,14 +3685,7 @@ xfs_iflush( > return error; > } > if (error) > - goto corrupt_out; > - > - /* > - * First flush out the inode that xfs_iflush was called with. > - */ > - error = xfs_iflush_int(ip, bp); > - if (error) > - goto corrupt_out; > + goto abort; > > /* > * If the buffer is pinned then push on the log now so we won't > @@ -3729,28 +3695,29 @@ xfs_iflush( > xfs_log_force(mp, 0); > > /* > - * inode clustering: try to gather other inodes into this write > + * Flush the provided inode then attempt to gather others from the > + * cluster into the write. > * > - * Note: Any error during clustering will result in the filesystem > - * being shut down and completion callbacks run on the cluster buffer. > - * As we have already flushed and attached this inode to the buffer, > - * it has already been aborted and released by xfs_iflush_cluster() and > - * so we have no further error handling to do here. > + * Note: Once we attempt to flush an inode, we must run buffer > + * completion callbacks on any failure. If this fails, simulate an I/O > + * failure on the buffer and shut down. > */ > - error = xfs_iflush_cluster(ip, bp); > - if (error) > - return error; > + error = xfs_iflush_int(ip, bp); > + if (!error) > + error = xfs_iflush_cluster(ip, bp); > + if (error) { > + bp->b_flags |= XBF_ASYNC; > + xfs_buf_ioend_fail(bp); > + goto shutdown; > + } > > *bpp = bp; > return 0; > > -corrupt_out: > - if (bp) > - xfs_buf_relse(bp); > - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); > -abort_out: > - /* abort the corrupt inode, as it was not attached to the buffer */ > +abort: > xfs_iflush_abort(ip, false); > +shutdown: > + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); > return error; > } > > @@ -3792,6 +3759,7 @@ xfs_iflush_int( > struct xfs_inode_log_item *iip = ip->i_itemp; > struct xfs_dinode *dip; > struct xfs_mount *mp = ip->i_mount; > + int error; > > ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); > ASSERT(xfs_isiflocked(ip)); > @@ -3799,15 +3767,21 @@ xfs_iflush_int( > ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); > ASSERT(iip != NULL && iip->ili_fields != 0); > > - /* set *dip = inode's place in the buffer */ > dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); > > + /* > + * We don't flush the inode if any of the following checks fail, but we > + * do still update the log item and attach to the backing buffer as if > + * the flush happened. This is a formality to facilitate predictable > + * error handling as the caller will shutdown and fail the buffer. > + */ > + error = -EFSCORRUPTED; > if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), > mp, XFS_ERRTAG_IFLUSH_1)) { > xfs_alert_tag(mp, XFS_PTAG_IFLUSH, > "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, > __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); > - goto corrupt_out; > + goto flush_out; > } > if (S_ISREG(VFS_I(ip)->i_mode)) { > if (XFS_TEST_ERROR( > @@ -3817,7 +3791,7 @@ xfs_iflush_int( > xfs_alert_tag(mp, XFS_PTAG_IFLUSH, > "%s: Bad regular inode %Lu, ptr "PTR_FMT, > __func__, ip->i_ino, ip); > - goto corrupt_out; > + goto flush_out; > } > } else if (S_ISDIR(VFS_I(ip)->i_mode)) { > if (XFS_TEST_ERROR( > @@ -3828,7 +3802,7 @@ xfs_iflush_int( > xfs_alert_tag(mp, XFS_PTAG_IFLUSH, > "%s: Bad directory inode %Lu, ptr "PTR_FMT, > __func__, ip->i_ino, ip); > - goto corrupt_out; > + goto flush_out; > } > } > if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > > @@ -3839,14 +3813,14 @@ xfs_iflush_int( > __func__, ip->i_ino, > ip->i_d.di_nextents + ip->i_d.di_anextents, > ip->i_d.di_nblocks, ip); > - goto corrupt_out; > + goto flush_out; > } > if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, > mp, XFS_ERRTAG_IFLUSH_6)) { > xfs_alert_tag(mp, XFS_PTAG_IFLUSH, > "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, > __func__, ip->i_ino, ip->i_d.di_forkoff, ip); > - goto corrupt_out; > + goto flush_out; > } > > /* > @@ -3863,7 +3837,7 @@ xfs_iflush_int( > > /* Check the inline fork data before we write out. */ > if (!xfs_inode_verify_forks(ip)) > - goto corrupt_out; > + goto flush_out; > > /* > * Copy the dirty parts of the inode into the on-disk inode. We always > @@ -3906,6 +3880,8 @@ xfs_iflush_int( > * need the AIL lock, because it is a 64 bit value that cannot be read > * atomically. > */ > + error = 0; > +flush_out: > iip->ili_last_fields = iip->ili_fields; > iip->ili_fields = 0; > iip->ili_fsync_fields = 0; > @@ -3915,10 +3891,10 @@ xfs_iflush_int( > &iip->ili_item.li_lsn); > > /* > - * Attach the function xfs_iflush_done to the inode's > - * buffer. This will remove the inode from the AIL > - * and unlock the inode's flush lock when the inode is > - * completely written to disk. > + * Attach the inode item callback to the buffer whether the flush > + * succeeded or not. If not, the caller will shut down and fail I/O > + * completion on the buffer to remove the inode from the AIL and release > + * the flush lock. > */ > xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); > > @@ -3927,10 +3903,7 @@ xfs_iflush_int( > > ASSERT(!list_empty(&bp->b_li_list)); > ASSERT(bp->b_iodone != NULL); > - return 0; > - > -corrupt_out: > - return -EFSCORRUPTED; > + return error; > } > > /* Release an inode. */ > -- > 2.21.1 >