On Fri, May 22, 2020 at 01:50:27PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
>
> Now that we have all the dirty inodes attached to the cluster
> buffer, we don't actually have to do radix tree lookups to find
> them. Sure, the radix tree is efficient, but walking a linked list
> of just the dirty inodes attached to the buffer is much better.
>
> We are also no longer dependent on having a locked inode passed into
> the function to determine where to start the lookup. This means we
> can drop it from the function call and treat all inodes the same.
>
> We also make xfs_iflush_cluster skip inodes marked with
> XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
> actively referencing or that are being re-initialised by inode
> lookup. If they are actually dirty, they'll get written by a future
> cluster flush....
>
> We also add a shutdown check after obtaining the flush lock so that
> we catch inodes that are dirty in memory and may have inconsistent
> state due to the shutdown in progress. We abort these inodes
> directly, and so they remove themselves directly from the buffer
> list and the AIL rather than having to wait for the buffer to be
> failed and callbacks run to process them correctly.

I suspect we should just kill off xfs_iflush_cluster with this, as it
is best split between xfs_iflush and xfs_inode_item_push.  POC patch
below, but as part of that I noticed some really odd error handling,
which I'll bring up separately.
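To spell out the locking rule both the commit message and the patch
rely on: reclaim sets XFS_IRECLAIM under i_flags_lock as early as
possible, while a flusher tests the flags and trylocks the ILOCK while
still holding i_flags_lock. That handshake can be modelled in
userspace. The sketch below is illustrative only; every name in it
(try_flush, reclaim, FLAG_RECLAIM, flags_lock, ilock) is an invented
stand-in for the XFS object, not kernel code:

/*
 * Userspace model of the reclaim vs. flush handshake. flags_lock
 * stands in for ip->i_flags_lock, ilock for the XFS ILOCK, and
 * FLAG_RECLAIM for XFS_IRECLAIM. Build with: cc -pthread model.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define FLAG_RECLAIM	0x1	/* models XFS_IRECLAIM */

static pthread_mutex_t flags_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t ilock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned int flags;	/* models ip->i_flags */

/*
 * Flush side: test the flags and trylock the ilock while still holding
 * flags_lock. Once both succeed, reclaim can no longer win the race,
 * because it cannot set FLAG_RECLAIM until it holds the ilock
 * exclusively, and we now hold it shared.
 */
static bool try_flush(void)
{
	pthread_mutex_lock(&flags_lock);
	if (flags & FLAG_RECLAIM) {
		/* reclaim got here first - skip this inode */
		pthread_mutex_unlock(&flags_lock);
		return false;
	}
	if (pthread_rwlock_tryrdlock(&ilock) != 0) {
		/* models xfs_ilock_nowait(ip, XFS_ILOCK_SHARED) failing */
		pthread_mutex_unlock(&flags_lock);
		return false;
	}
	pthread_mutex_unlock(&flags_lock);

	/* ... flush the inode to the cluster buffer here ... */

	pthread_rwlock_unlock(&ilock);
	return true;
}

/*
 * Reclaim side: take the ilock exclusively, then mark the inode as
 * being reclaimed under flags_lock as early as possible, so any
 * flusher that takes flags_lock afterwards sees the flag and skips.
 */
static void reclaim(void)
{
	pthread_rwlock_wrlock(&ilock);
	pthread_mutex_lock(&flags_lock);
	flags |= FLAG_RECLAIM;
	pthread_mutex_unlock(&flags_lock);

	/* ... tear the inode down; flushers now skip it ... */

	pthread_rwlock_unlock(&ilock);
}

int main(void)
{
	printf("flush before reclaim: %d\n", try_flush());	/* 1 */
	reclaim();
	printf("flush after reclaim:  %d\n", try_flush());	/* 0 */
	return 0;
}

Note the flush side only ever trylocks the ilock while flags_lock is
held; that is what makes the opposite lock order on the reclaim side
deadlock-free.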
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 6f873eca8c916..8784e70626403 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1102,12 +1102,12 @@ xfs_reclaim_inode(
 	/*
 	 * Because we use RCU freeing we need to ensure the inode always appears
 	 * to be reclaimed with an invalid inode number when in the free state.
-	 * We do this as early as possible under the ILOCK so that
-	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
-	 * detect races with us here. By doing this, we guarantee that once
-	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
-	 * it will see either a valid inode that will serialise correctly, or it
-	 * will see an invalid inode that it can skip.
+	 * We do this as early as possible under the ILOCK so that xfs_iflush()
+	 * and xfs_ifree_cluster() can be guaranteed to detect races with us
+	 * here. By doing this, we guarantee that once xfs_iflush() or
+	 * xfs_ifree_cluster() has locked XFS_ILOCK that it will see either a
+	 * valid inode that will serialise correctly, or it will see an invalid
+	 * inode that it can skip.
 	 */
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags = XFS_IRECLAIM;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5bbdb0c5ed172..8b0197a386a0f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3431,19 +3431,70 @@ xfs_rename(
 
 int
 xfs_iflush(
-	struct xfs_inode	*ip,
+	struct xfs_inode_log_item *iip,
 	struct xfs_buf		*bp)
 {
-	struct xfs_inode_log_item *iip = ip->i_itemp;
-	struct xfs_dinode	*dip;
+	struct xfs_inode	*ip = iip->ili_inode;
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error;
+	struct xfs_dinode	*dip;
+	int			error = 0;
+
+	/*
+	 * Quick and dirty check to avoid locks if possible.
+	 */
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+		return 0;
+	if (xfs_ipincount(ip))
+		return 0;
+
+	/*
+	 * The inode is still attached to the buffer, which means it is dirty
+	 * but reclaim might try to grab it. Check carefully for that, and grab
+	 * the ilock while still holding the i_flags_lock to guarantee reclaim
+	 * will not be able to reclaim this inode once we drop the i_flags_lock.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+		goto out_unlock_flags_lock;
+
+	/*
+	 * ILOCK will pin the inode against reclaim and prevent concurrent
+	 * transactions modifying the inode while we are flushing the inode.
+	 */
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+		goto out_unlock_flags_lock;
+	spin_unlock(&ip->i_flags_lock);
+
+	/*
+	 * Skip inodes that are already flush locked as they have already been
+	 * written to the buffer.
+	 */
+	if (!xfs_iflock_nowait(ip))
+		goto out_unlock_ilock;
+
+	/*
+	 * If we are shut down, unpin and abort the inode now as there is no
+	 * point in flushing it to the buffer just to get an IO completion to
+	 * abort the buffer and remove it from the AIL.
+	 */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		/* xfs_iflush_abort() drops the flush lock */
+		xfs_iflush_abort(ip);
+		error = -EIO;
+		goto out_unlock_ilock;
+	}
+
+	/* don't block waiting on a log force to unpin dirty inodes */
+	if (xfs_ipincount(ip) || xfs_inode_clean(ip)) {
+		xfs_ifunlock(ip);
+		goto out_unlock_ilock;
+	}
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-	ASSERT(xfs_isiflocked(ip));
 	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-	ASSERT(iip != NULL && iip->ili_fields != 0);
+	ASSERT(iip->ili_fields != 0);
 	ASSERT(iip->ili_item.li_buf == bp);
 
 	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -3574,122 +3625,13 @@ xfs_iflush(
 	/* generate the checksum. */
 	xfs_dinode_calc_crc(mp, dip);
 
-	return error;
-}
-
-/*
- * Non-blocking flush of dirty inode metadata into the backing buffer.
- *
- * The caller must have a reference to the inode and hold the cluster buffer
- * locked. The function will walk across all the inodes on the cluster buffer it
- * can find and lock without blocking, and flush them to the cluster buffer.
- *
- * On success, the caller must write out the buffer returned in *bp and
- * release it. On failure, the filesystem will be shut down, the buffer will
- * have been unlocked and released, and EFSCORRUPTED will be returned.
- */
-int
-xfs_iflush_cluster(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_mount;
-	struct xfs_log_item	*lip, *n;
-	struct xfs_inode	*ip;
-	struct xfs_inode_log_item *iip;
-	int			clcount = 0;
-	int			error = 0;
-
-	/*
-	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
-	 * can remove itself from the list.
-	 */
-	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
-		iip = (struct xfs_inode_log_item *)lip;
-		ip = iip->ili_inode;
-
-		/*
-		 * Quick and dirty check to avoid locks if possible.
-		 */
-		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
-			continue;
-		if (xfs_ipincount(ip))
-			continue;
-
-		/*
-		 * The inode is still attached to the buffer, which means it is
-		 * dirty but reclaim might try to grab it. Check carefully for
-		 * that, and grab the ilock while still holding the i_flags_lock
-		 * to guarantee reclaim will not be able to reclaim this inode
-		 * once we drop the i_flags_lock.
-		 */
-		spin_lock(&ip->i_flags_lock);
-		ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
-		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
-			spin_unlock(&ip->i_flags_lock);
-			continue;
-		}
-
-		/*
-		 * ILOCK will pin the inode against reclaim and prevent
-		 * concurrent transactions modifying the inode while we are
-		 * flushing the inode.
-		 */
-		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-			spin_unlock(&ip->i_flags_lock);
-			continue;
-		}
-		spin_unlock(&ip->i_flags_lock);
-
-		/*
-		 * Skip inodes that are already flush locked as they have
-		 * already been written to the buffer.
-		 */
-		if (!xfs_iflock_nowait(ip)) {
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			continue;
-		}
-
-		/*
-		 * If we are shut down, unpin and abort the inode now as there
-		 * is no point in flushing it to the buffer just to get an IO
-		 * completion to abort the buffer and remove it from the AIL.
-		 */
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			xfs_iunpin_wait(ip);
-			/* xfs_iflush_abort() drops the flush lock */
-			xfs_iflush_abort(ip);
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			error = EIO;
-			continue;
-		}
-
-		/* don't block waiting on a log force to unpin dirty inodes */
-		if (xfs_ipincount(ip)) {
-			xfs_ifunlock(ip);
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			continue;
-		}
-
-		if (!xfs_inode_clean(ip))
-			error = xfs_iflush(ip, bp);
-		else
-			xfs_ifunlock(ip);
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		if (error)
-			break;
-		clcount++;
-	}
-
-	if (clcount) {
-		XFS_STATS_INC(mp, xs_icluster_flushcnt);
-		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
-	}
+out_unlock_ilock:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return error;
 
-	if (error) {
-		bp->b_flags |= XBF_ASYNC;
-		xfs_buf_ioend_fail(bp);
-		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-	}
+out_unlock_flags_lock:
+	spin_unlock(&ip->i_flags_lock);
 	return error;
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b93cf9076df8a..870e6768e119e 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -427,7 +427,7 @@ int		xfs_log_force_inode(struct xfs_inode *ip);
 void		xfs_iunpin_wait(xfs_inode_t *);
 #define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
 
-int		xfs_iflush_cluster(struct xfs_buf *);
+int		xfs_iflush(struct xfs_inode_log_item *iip, struct xfs_buf *bp);
 void		xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
 				struct xfs_inode *ip1, uint ip1_mode);
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 4dd4f45dcc46e..b82d7f3628745 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -501,8 +501,10 @@ xfs_inode_item_push(
 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	struct xfs_inode	*ip = iip->ili_inode;
 	struct xfs_buf		*bp = lip->li_buf;
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_log_item	*l, *n;
 	uint			rval = XFS_ITEM_SUCCESS;
-	int			error;
+	int			clcount = 0;
 
 	ASSERT(iip->ili_item.li_buf);
 
@@ -530,17 +532,34 @@ xfs_inode_item_push(
 	 * for IO until we queue it for delwri submission.
 	 */
 	xfs_buf_hold(bp);
-	error = xfs_iflush_cluster(bp);
-	if (!error) {
-		if (!xfs_buf_delwri_queue(bp, buffer_list)) {
-			ASSERT(0);
-			rval = XFS_ITEM_FLUSHING;
+
+	/*
+	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
+	 * can remove itself from the list.
+	 */
+	list_for_each_entry_safe(l, n, &bp->b_li_list, li_bio_list) {
+		if (xfs_iflush(INODE_ITEM(l), bp)) {
+			bp->b_flags |= XBF_ASYNC;
+			xfs_buf_ioend_fail(bp);
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			rval = XFS_ITEM_LOCKED;
+			goto out_unlock;
 		}
-		xfs_buf_relse(bp);
-	} else {
-		rval = XFS_ITEM_LOCKED;
+		clcount++;
+	}
+
+	if (clcount) {
+		XFS_STATS_INC(mp, xs_icluster_flushcnt);
+		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
+	}
+
+	if (!xfs_buf_delwri_queue(bp, buffer_list)) {
+		ASSERT(0);
+		rval = XFS_ITEM_FLUSHING;
 	}
+	xfs_buf_relse(bp);
 
+out_unlock:
 	spin_lock(&lip->li_ailp->ail_lock);
 	return rval;
 }