On Tue, Jan 31, 2023 at 07:21:39PM -0800, Darrick J. Wong wrote: > On Fri, Dec 09, 2022 at 07:05:19PM +0800, Long Li wrote: > > After running unplug disk test and unmount filesystem, the umount thread > > hung all the time. > > > > crash> dmesg > > sd 0:0:0:0: rejecting I/O to offline device > > XFS (sda): log I/O error -5 > > XFS (sda): Corruption of in-memory data (0x8) detected at xfs_defer_finish_noroll+0x12e0/0x1cf0 > > (fs/xfs/libxfs/xfs_defer.c:504). Shutting down filesystem. > > XFS (sda): Please unmount the filesystem and rectify the problem(s) > > XFS (sda): xfs_inactive_ifree: xfs_trans_commit returned error -5 > > XFS (sda): Unmounting Filesystem > > > > crash> bt 3368 > > PID: 3368 TASK: ffff88801bcd8040 CPU: 3 COMMAND: "umount" > > #0 [ffffc900086a7ae0] __schedule at ffffffff83d3fd25 > > #1 [ffffc900086a7be8] schedule at ffffffff83d414dd > > #2 [ffffc900086a7c10] xfs_ail_push_all_sync at ffffffff8256db24 > > #3 [ffffc900086a7d18] xfs_unmount_flush_inodes at ffffffff824ee7e2 > > #4 [ffffc900086a7d28] xfs_unmountfs at ffffffff824f2eff > > #5 [ffffc900086a7da8] xfs_fs_put_super at ffffffff82503e69 > > #6 [ffffc900086a7de8] generic_shutdown_super at ffffffff81aeb8cd > > #7 [ffffc900086a7e10] kill_block_super at ffffffff81aefcfa > > #8 [ffffc900086a7e30] deactivate_locked_super at ffffffff81aeb2da > > #9 [ffffc900086a7e48] deactivate_super at ffffffff81aeb639 > > #10 [ffffc900086a7e68] cleanup_mnt at ffffffff81b6ddd5 > > #11 [ffffc900086a7ea0] __cleanup_mnt at ffffffff81b6dfdf > > #12 [ffffc900086a7eb0] task_work_run at ffffffff8126e5cf > > #13 [ffffc900086a7ef8] exit_to_user_mode_prepare at ffffffff813fa136 > > #14 [ffffc900086a7f28] syscall_exit_to_user_mode at ffffffff83d25dbb > > #15 [ffffc900086a7f40] do_syscall_64 at ffffffff83d1f8d9 > > #16 [ffffc900086a7f50] entry_SYSCALL_64_after_hwframe at ffffffff83e00085 > > > > When we free a cluster buffer from xfs_ifree_cluster, all the inodes in > > cache are marked XFS_ISTALE. On journal commit dirty stale inodes as are > > handled by both buffer and inode log items, inodes marked as XFS_ISTALE > > in AIL will be removed from the AIL because the buffer log item will clean > > it. If the transaction commit fails in the xfs_inactive_ifree(), inodes > > marked as XFS_ISTALE will be left in AIL due to buf log item is not > > committed, this will cause the unmount thread above to be blocked all the > > time. Error handling in xfs_inactive_ifree() is not enough, the above > > exception needs to be considered. > > > > Signed-off-by: Long Li <leo.lilong@xxxxxxxxxx> > > --- > > fs/xfs/xfs_inode.c | 114 +++++++++++++++++++++++++++++++++++++++++---- > > fs/xfs/xfs_inode.h | 1 - > > 2 files changed, 105 insertions(+), 10 deletions(-) > > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > > index d354ea2b74f9..b6808c0a2868 100644 > > --- a/fs/xfs/xfs_inode.c > > +++ b/fs/xfs/xfs_inode.c > > @@ -49,6 +49,9 @@ struct kmem_cache *xfs_inode_cache; > > STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); > > STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, > > struct xfs_inode *); > > +STATIC int xfs_ifree(struct xfs_trans *tp, struct xfs_inode *ip, > > + struct xfs_icluster *xic); > > +STATIC void xfs_ifree_abort(struct xfs_inode *ip, struct xfs_icluster *xic); > > > > /* > > * helper function to extract extent size hint from inode > > @@ -1544,6 +1547,7 @@ xfs_inactive_ifree( > > { > > struct xfs_mount *mp = ip->i_mount; > > struct xfs_trans *tp; > > + struct xfs_icluster xic = { 0 }; > > int error; > > > > /* > > @@ -1598,7 +1602,7 @@ xfs_inactive_ifree( > > xfs_ilock(ip, XFS_ILOCK_EXCL); > > xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); > > > > - error = xfs_ifree(tp, ip); > > + error = xfs_ifree(tp, ip, &xic); > > ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); > > if (error) { > > /* > > @@ -1612,7 +1616,7 @@ xfs_inactive_ifree( > > xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); > > } > > xfs_trans_cancel(tp); > > - return error; > > + goto out_error; > > } > > > > /* > > @@ -1625,11 +1629,19 @@ xfs_inactive_ifree( > > * to try to keep going. Make sure it's not a silent error. > > */ > > error = xfs_trans_commit(tp); > > - if (error) > > + if (error) { > > xfs_notice(mp, "%s: xfs_trans_commit returned error %d", > > __func__, error); > > + goto out_error; > > + } > > > > return 0; > > + > > +out_error: > > + if (xic.deleted) > > + xfs_ifree_abort(ip, &xic); > > + > > + return error; > > } > > > > /* > > @@ -2259,14 +2271,14 @@ xfs_ifree_cluster( > > * inodes in the AGI. We need to remove the inode from that list atomically with > > * respect to freeing it here. > > */ > > -int > > +STATIC int > > xfs_ifree( > > struct xfs_trans *tp, > > - struct xfs_inode *ip) > > + struct xfs_inode *ip, > > + struct xfs_icluster *xic) > > { > > struct xfs_mount *mp = ip->i_mount; > > struct xfs_perag *pag; > > - struct xfs_icluster xic = { 0 }; > > struct xfs_inode_log_item *iip = ip->i_itemp; > > int error; > > > > @@ -2284,7 +2296,7 @@ xfs_ifree( > > * makes the AGI lock -> unlinked list modification order the same as > > * used in O_TMPFILE creation. > > */ > > - error = xfs_difree(tp, pag, ip->i_ino, &xic); > > + error = xfs_difree(tp, pag, ip->i_ino, xic); > > if (error) > > goto out; > > > > @@ -2323,13 +2335,97 @@ xfs_ifree( > > VFS_I(ip)->i_generation++; > > xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); > > > > - if (xic.deleted) > > - error = xfs_ifree_cluster(tp, pag, ip, &xic); > > + if (xic->deleted) > > + error = xfs_ifree_cluster(tp, pag, ip, xic); > > out: > > xfs_perag_put(pag); > > return error; > > } > > > > +static void > > +xfs_ifree_abort_inode_stale( > > + struct xfs_perag *pag, > > + xfs_ino_t inum) > > +{ > > + struct xfs_mount *mp = pag->pag_mount; > > + struct xfs_inode_log_item *iip; > > + struct xfs_inode *ip; > > + > > +retry: > > + rcu_read_lock(); > > + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); > > + > > + /* Inode not in memory, nothing to do */ > > + if (!ip) { > > + rcu_read_unlock(); > > + return; > > + } > > + > > + /* Skip invalid or not stale inode */ > > + if (ip->i_ino != inum || !xfs_iflags_test(ip, XFS_ISTALE)) { > > + rcu_read_unlock(); > > + return; > > + } > > + > > + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { > > + rcu_read_unlock(); > > + delay(1); > > + goto retry; > > + } > > + > > + iip = ip->i_itemp; > > + if (!iip || list_empty(&iip->ili_item.li_bio_list)) > > + goto out_iunlock; > > + > > + if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) > > + xfs_iflush_abort(ip); > > + else > > + xfs_iflags_clear(ip, XFS_IFLUSHING); > > Er... why is the ifree code tearing into the inode log item state ? > > Shouldn't this be getting done from the buffer log item when we release > it and find that it's aborted? > > --D Yes, it doesn't looks good here, traverse buffer's b_li_list and abort xfs_inode marked as XFS_ISTALE could be better. > > > + > > +out_iunlock: > > + xfs_iunlock(ip, XFS_ILOCK_EXCL); > > + rcu_read_unlock(); > > +} > > + > > +/* > > + * This is called to clean up inodes marked as stale in xfs_ifree > > + */ > > +STATIC void > > +xfs_ifree_abort( > > + struct xfs_inode *ip, > > + struct xfs_icluster *xic) > > +{ > > + struct xfs_mount *mp = ip->i_mount; > > + struct xfs_perag *pag; > > + struct xfs_ino_geometry *igeo = M_IGEO(mp); > > + xfs_ino_t inum = xic->first_ino; > > + int nbufs; > > + int i, j; > > + int ioffset; > > + > > + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); > > + > > + nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; > > + > > + for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { > > + /* > > + * The allocation bitmap tells us which inodes of the chunk were > > + * physically allocated. Skip the cluster if an inode falls into > > + * a sparse region. > > + */ > > + ioffset = inum - xic->first_ino; > > + if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { > > + ASSERT(ioffset % igeo->inodes_per_cluster == 0); > > + continue; > > + } > > + > > + for (i = 0; i < igeo->inodes_per_cluster; i++) > > + xfs_ifree_abort_inode_stale(pag, inum + i); > > + > > + } > > + xfs_perag_put(pag); > > +} > > + > > /* > > * This is called to unpin an inode. The caller must have the inode locked > > * in at least shared mode so that the buffer cannot be subsequently pinned > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > > index fa780f08dc89..423542bf6af1 100644 > > --- a/fs/xfs/xfs_inode.h > > +++ b/fs/xfs/xfs_inode.h > > @@ -499,7 +499,6 @@ uint xfs_ilock_data_map_shared(struct xfs_inode *); > > uint xfs_ilock_attr_map_shared(struct xfs_inode *); > > > > uint xfs_ip2xflags(struct xfs_inode *); > > -int xfs_ifree(struct xfs_trans *, struct xfs_inode *); > > int xfs_itruncate_extents_flags(struct xfs_trans **, > > struct xfs_inode *, int, xfs_fsize_t, int); > > void xfs_iext_realloc(xfs_inode_t *, int, int); > > -- > > 2.31.1 > >