On Fri, Dec 09, 2022 at 07:05:19PM +0800, Long Li wrote: > After running unplug disk test and unmount filesystem, the umount thread > hung all the time. > > crash> dmesg > sd 0:0:0:0: rejecting I/O to offline device > XFS (sda): log I/O error -5 > XFS (sda): Corruption of in-memory data (0x8) detected at xfs_defer_finish_noroll+0x12e0/0x1cf0 > (fs/xfs/libxfs/xfs_defer.c:504). Shutting down filesystem. > XFS (sda): Please unmount the filesystem and rectify the problem(s) > XFS (sda): xfs_inactive_ifree: xfs_trans_commit returned error -5 > XFS (sda): Unmounting Filesystem > > crash> bt 3368 > PID: 3368 TASK: ffff88801bcd8040 CPU: 3 COMMAND: "umount" > #0 [ffffc900086a7ae0] __schedule at ffffffff83d3fd25 > #1 [ffffc900086a7be8] schedule at ffffffff83d414dd > #2 [ffffc900086a7c10] xfs_ail_push_all_sync at ffffffff8256db24 > #3 [ffffc900086a7d18] xfs_unmount_flush_inodes at ffffffff824ee7e2 > #4 [ffffc900086a7d28] xfs_unmountfs at ffffffff824f2eff > #5 [ffffc900086a7da8] xfs_fs_put_super at ffffffff82503e69 > #6 [ffffc900086a7de8] generic_shutdown_super at ffffffff81aeb8cd > #7 [ffffc900086a7e10] kill_block_super at ffffffff81aefcfa > #8 [ffffc900086a7e30] deactivate_locked_super at ffffffff81aeb2da > #9 [ffffc900086a7e48] deactivate_super at ffffffff81aeb639 > #10 [ffffc900086a7e68] cleanup_mnt at ffffffff81b6ddd5 > #11 [ffffc900086a7ea0] __cleanup_mnt at ffffffff81b6dfdf > #12 [ffffc900086a7eb0] task_work_run at ffffffff8126e5cf > #13 [ffffc900086a7ef8] exit_to_user_mode_prepare at ffffffff813fa136 > #14 [ffffc900086a7f28] syscall_exit_to_user_mode at ffffffff83d25dbb > #15 [ffffc900086a7f40] do_syscall_64 at ffffffff83d1f8d9 > #16 [ffffc900086a7f50] entry_SYSCALL_64_after_hwframe at ffffffff83e00085 > > When we free a cluster buffer from xfs_ifree_cluster, all the inodes in > cache are marked XFS_ISTALE. On journal commit dirty stale inodes as are > handled by both buffer and inode log items, inodes marked as XFS_ISTALE > in AIL will be removed from the AIL because the buffer log item will clean > it. If the transaction commit fails in the xfs_inactive_ifree(), inodes > marked as XFS_ISTALE will be left in AIL due to buf log item is not > committed, this will cause the unmount thread above to be blocked all the > time. Error handling in xfs_inactive_ifree() is not enough, the above > exception needs to be considered. > > Signed-off-by: Long Li <leo.lilong@xxxxxxxxxx> > --- > fs/xfs/xfs_inode.c | 114 +++++++++++++++++++++++++++++++++++++++++---- > fs/xfs/xfs_inode.h | 1 - > 2 files changed, 105 insertions(+), 10 deletions(-) > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index d354ea2b74f9..b6808c0a2868 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -49,6 +49,9 @@ struct kmem_cache *xfs_inode_cache; > STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); > STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, > struct xfs_inode *); > +STATIC int xfs_ifree(struct xfs_trans *tp, struct xfs_inode *ip, > + struct xfs_icluster *xic); > +STATIC void xfs_ifree_abort(struct xfs_inode *ip, struct xfs_icluster *xic); > > /* > * helper function to extract extent size hint from inode > @@ -1544,6 +1547,7 @@ xfs_inactive_ifree( > { > struct xfs_mount *mp = ip->i_mount; > struct xfs_trans *tp; > + struct xfs_icluster xic = { 0 }; > int error; > > /* > @@ -1598,7 +1602,7 @@ xfs_inactive_ifree( > xfs_ilock(ip, XFS_ILOCK_EXCL); > xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); > > - error = xfs_ifree(tp, ip); > + error = xfs_ifree(tp, ip, &xic); > ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); > if (error) { > /* > @@ -1612,7 +1616,7 @@ xfs_inactive_ifree( > xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); > } > xfs_trans_cancel(tp); > - return error; > + goto out_error; > } > > /* > @@ -1625,11 +1629,19 @@ xfs_inactive_ifree( > * to try to keep going. Make sure it's not a silent error. > */ > error = xfs_trans_commit(tp); > - if (error) > + if (error) { > xfs_notice(mp, "%s: xfs_trans_commit returned error %d", > __func__, error); > + goto out_error; > + } > > return 0; > + > +out_error: > + if (xic.deleted) > + xfs_ifree_abort(ip, &xic); > + > + return error; > } > > /* > @@ -2259,14 +2271,14 @@ xfs_ifree_cluster( > * inodes in the AGI. We need to remove the inode from that list atomically with > * respect to freeing it here. > */ > -int > +STATIC int > xfs_ifree( > struct xfs_trans *tp, > - struct xfs_inode *ip) > + struct xfs_inode *ip, > + struct xfs_icluster *xic) > { > struct xfs_mount *mp = ip->i_mount; > struct xfs_perag *pag; > - struct xfs_icluster xic = { 0 }; > struct xfs_inode_log_item *iip = ip->i_itemp; > int error; > > @@ -2284,7 +2296,7 @@ xfs_ifree( > * makes the AGI lock -> unlinked list modification order the same as > * used in O_TMPFILE creation. > */ > - error = xfs_difree(tp, pag, ip->i_ino, &xic); > + error = xfs_difree(tp, pag, ip->i_ino, xic); > if (error) > goto out; > > @@ -2323,13 +2335,97 @@ xfs_ifree( > VFS_I(ip)->i_generation++; > xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); > > - if (xic.deleted) > - error = xfs_ifree_cluster(tp, pag, ip, &xic); > + if (xic->deleted) > + error = xfs_ifree_cluster(tp, pag, ip, xic); > out: > xfs_perag_put(pag); > return error; > } > > +static void > +xfs_ifree_abort_inode_stale( > + struct xfs_perag *pag, > + xfs_ino_t inum) > +{ > + struct xfs_mount *mp = pag->pag_mount; > + struct xfs_inode_log_item *iip; > + struct xfs_inode *ip; > + > +retry: > + rcu_read_lock(); > + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); > + > + /* Inode not in memory, nothing to do */ > + if (!ip) { > + rcu_read_unlock(); > + return; > + } > + > + /* Skip invalid or not stale inode */ > + if (ip->i_ino != inum || !xfs_iflags_test(ip, XFS_ISTALE)) { > + rcu_read_unlock(); > + return; > + } > + > + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { > + rcu_read_unlock(); > + delay(1); > + goto retry; > + } > + > + iip = ip->i_itemp; > + if (!iip || list_empty(&iip->ili_item.li_bio_list)) > + goto out_iunlock; > + > + if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) > + xfs_iflush_abort(ip); > + else > + xfs_iflags_clear(ip, XFS_IFLUSHING); Er... why is the ifree code tearing into the inode log item state ? Shouldn't this be getting done from the buffer log item when we release it and find that it's aborted? --D > + > +out_iunlock: > + xfs_iunlock(ip, XFS_ILOCK_EXCL); > + rcu_read_unlock(); > +} > + > +/* > + * This is called to clean up inodes marked as stale in xfs_ifree > + */ > +STATIC void > +xfs_ifree_abort( > + struct xfs_inode *ip, > + struct xfs_icluster *xic) > +{ > + struct xfs_mount *mp = ip->i_mount; > + struct xfs_perag *pag; > + struct xfs_ino_geometry *igeo = M_IGEO(mp); > + xfs_ino_t inum = xic->first_ino; > + int nbufs; > + int i, j; > + int ioffset; > + > + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); > + > + nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; > + > + for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { > + /* > + * The allocation bitmap tells us which inodes of the chunk were > + * physically allocated. Skip the cluster if an inode falls into > + * a sparse region. > + */ > + ioffset = inum - xic->first_ino; > + if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { > + ASSERT(ioffset % igeo->inodes_per_cluster == 0); > + continue; > + } > + > + for (i = 0; i < igeo->inodes_per_cluster; i++) > + xfs_ifree_abort_inode_stale(pag, inum + i); > + > + } > + xfs_perag_put(pag); > +} > + > /* > * This is called to unpin an inode. The caller must have the inode locked > * in at least shared mode so that the buffer cannot be subsequently pinned > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index fa780f08dc89..423542bf6af1 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -499,7 +499,6 @@ uint xfs_ilock_data_map_shared(struct xfs_inode *); > uint xfs_ilock_attr_map_shared(struct xfs_inode *); > > uint xfs_ip2xflags(struct xfs_inode *); > -int xfs_ifree(struct xfs_trans *, struct xfs_inode *); > int xfs_itruncate_extents_flags(struct xfs_trans **, > struct xfs_inode *, int, xfs_fsize_t, int); > void xfs_iext_realloc(xfs_inode_t *, int, int); > -- > 2.31.1 >