Re: [PATCH] xfs: fix hung when transaction commit fail in xfs_inactive_ifree

Long Li <leo.lilong@xxxxxxxxxx> · Sat, 11 Feb 2023 19:46:53 +0800



On Tue, Jan 31, 2023 at 07:21:39PM -0800, Darrick J. Wong wrote:
> On Fri, Dec 09, 2022 at 07:05:19PM +0800, Long Li wrote:
> > After running unplug disk test and unmount filesystem, the umount thread
> > hung all the time.
> > 
> >  crash> dmesg
> >  sd 0:0:0:0: rejecting I/O to offline device
> >  XFS (sda): log I/O error -5
> >  XFS (sda): Corruption of in-memory data (0x8) detected at xfs_defer_finish_noroll+0x12e0/0x1cf0
> > 	(fs/xfs/libxfs/xfs_defer.c:504).  Shutting down filesystem.
> >  XFS (sda): Please unmount the filesystem and rectify the problem(s)
> >  XFS (sda): xfs_inactive_ifree: xfs_trans_commit returned error -5
> >  XFS (sda): Unmounting Filesystem
> > 
> >  crash> bt 3368
> >  PID: 3368   TASK: ffff88801bcd8040  CPU: 3   COMMAND: "umount"
> >   #0 [ffffc900086a7ae0] __schedule at ffffffff83d3fd25
> >   #1 [ffffc900086a7be8] schedule at ffffffff83d414dd
> >   #2 [ffffc900086a7c10] xfs_ail_push_all_sync at ffffffff8256db24
> >   #3 [ffffc900086a7d18] xfs_unmount_flush_inodes at ffffffff824ee7e2
> >   #4 [ffffc900086a7d28] xfs_unmountfs at ffffffff824f2eff
> >   #5 [ffffc900086a7da8] xfs_fs_put_super at ffffffff82503e69
> >   #6 [ffffc900086a7de8] generic_shutdown_super at ffffffff81aeb8cd
> >   #7 [ffffc900086a7e10] kill_block_super at ffffffff81aefcfa
> >   #8 [ffffc900086a7e30] deactivate_locked_super at ffffffff81aeb2da
> >   #9 [ffffc900086a7e48] deactivate_super at ffffffff81aeb639
> >  #10 [ffffc900086a7e68] cleanup_mnt at ffffffff81b6ddd5
> >  #11 [ffffc900086a7ea0] __cleanup_mnt at ffffffff81b6dfdf
> >  #12 [ffffc900086a7eb0] task_work_run at ffffffff8126e5cf
> >  #13 [ffffc900086a7ef8] exit_to_user_mode_prepare at ffffffff813fa136
> >  #14 [ffffc900086a7f28] syscall_exit_to_user_mode at ffffffff83d25dbb
> >  #15 [ffffc900086a7f40] do_syscall_64 at ffffffff83d1f8d9
> >  #16 [ffffc900086a7f50] entry_SYSCALL_64_after_hwframe at ffffffff83e00085
> > 
> > When we free a cluster buffer from xfs_ifree_cluster, all the inodes in
> > cache are marked XFS_ISTALE. On journal commit dirty stale inodes as are
> > handled by both buffer and inode log items, inodes marked as XFS_ISTALE
> > in AIL will be removed from the AIL because the buffer log item will clean
> > it. If the transaction commit fails in the xfs_inactive_ifree(), inodes
> > marked as XFS_ISTALE will be left in AIL due to buf log item is not
> > committed, this will cause the unmount thread above to be blocked all the
> > time. Error handling in xfs_inactive_ifree() is not enough, the above
> > exception needs to be considered.
> > 
> > Signed-off-by: Long Li <leo.lilong@xxxxxxxxxx>
> > ---
> >  fs/xfs/xfs_inode.c | 114 +++++++++++++++++++++++++++++++++++++++++----
> >  fs/xfs/xfs_inode.h |   1 -
> >  2 files changed, 105 insertions(+), 10 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index d354ea2b74f9..b6808c0a2868 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -49,6 +49,9 @@ struct kmem_cache *xfs_inode_cache;
> >  STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
> >  STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
> >  	struct xfs_inode *);
> > +STATIC int xfs_ifree(struct xfs_trans *tp, struct xfs_inode *ip,
> > +		struct xfs_icluster *xic);
> > +STATIC void xfs_ifree_abort(struct xfs_inode *ip, struct xfs_icluster *xic);
> >  
> >  /*
> >   * helper function to extract extent size hint from inode
> > @@ -1544,6 +1547,7 @@ xfs_inactive_ifree(
> >  {
> >  	struct xfs_mount	*mp = ip->i_mount;
> >  	struct xfs_trans	*tp;
> > +	struct xfs_icluster     xic = { 0 };
> >  	int			error;
> >  
> >  	/*
> > @@ -1598,7 +1602,7 @@ xfs_inactive_ifree(
> >  	xfs_ilock(ip, XFS_ILOCK_EXCL);
> >  	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
> >  
> > -	error = xfs_ifree(tp, ip);
> > +	error = xfs_ifree(tp, ip, &xic);
> >  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> >  	if (error) {
> >  		/*
> > @@ -1612,7 +1616,7 @@ xfs_inactive_ifree(
> >  			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
> >  		}
> >  		xfs_trans_cancel(tp);
> > -		return error;
> > +		goto out_error;
> >  	}
> >  
> >  	/*
> > @@ -1625,11 +1629,19 @@ xfs_inactive_ifree(
> >  	 * to try to keep going. Make sure it's not a silent error.
> >  	 */
> >  	error = xfs_trans_commit(tp);
> > -	if (error)
> > +	if (error) {
> >  		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
> >  			__func__, error);
> > +		goto out_error;
> > +	}
> >  
> >  	return 0;
> > +
> > +out_error:
> > +	if (xic.deleted)
> > +		xfs_ifree_abort(ip, &xic);
> > +
> > +	return error;
> >  }
> >  
> >  /*
> > @@ -2259,14 +2271,14 @@ xfs_ifree_cluster(
> >   * inodes in the AGI. We need to remove the inode from that list atomically with
> >   * respect to freeing it here.
> >   */
> > -int
> > +STATIC int
> >  xfs_ifree(
> >  	struct xfs_trans	*tp,
> > -	struct xfs_inode	*ip)
> > +	struct xfs_inode	*ip,
> > +	struct xfs_icluster     *xic)
> >  {
> >  	struct xfs_mount	*mp = ip->i_mount;
> >  	struct xfs_perag	*pag;
> > -	struct xfs_icluster	xic = { 0 };
> >  	struct xfs_inode_log_item *iip = ip->i_itemp;
> >  	int			error;
> >  
> > @@ -2284,7 +2296,7 @@ xfs_ifree(
> >  	 * makes the AGI lock -> unlinked list modification order the same as
> >  	 * used in O_TMPFILE creation.
> >  	 */
> > -	error = xfs_difree(tp, pag, ip->i_ino, &xic);
> > +	error = xfs_difree(tp, pag, ip->i_ino, xic);
> >  	if (error)
> >  		goto out;
> >  
> > @@ -2323,13 +2335,97 @@ xfs_ifree(
> >  	VFS_I(ip)->i_generation++;
> >  	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> >  
> > -	if (xic.deleted)
> > -		error = xfs_ifree_cluster(tp, pag, ip, &xic);
> > +	if (xic->deleted)
> > +		error = xfs_ifree_cluster(tp, pag, ip, xic);
> >  out:
> >  	xfs_perag_put(pag);
> >  	return error;
> >  }
> >  
> > +static void
> > +xfs_ifree_abort_inode_stale(
> > +	struct xfs_perag	*pag,
> > +	xfs_ino_t		inum)
> > +{
> > +	struct xfs_mount        *mp = pag->pag_mount;
> > +	struct xfs_inode_log_item *iip;
> > +	struct xfs_inode	*ip;
> > +
> > +retry:
> > +	rcu_read_lock();
> > +	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
> > +
> > +	/* Inode not in memory, nothing to do */
> > +	if (!ip) {
> > +		rcu_read_unlock();
> > +		return;
> > +	}
> > +
> > +	/* Skip invalid or not stale inode */
> > +	if (ip->i_ino != inum || !xfs_iflags_test(ip, XFS_ISTALE)) {
> > +		rcu_read_unlock();
> > +		return;
> > +	}
> > +
> > +	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
> > +		rcu_read_unlock();
> > +		delay(1);
> > +		goto retry;
> > +	}
> > +
> > +	iip = ip->i_itemp;
> > +	if (!iip || list_empty(&iip->ili_item.li_bio_list))
> > +		goto out_iunlock;
> > +
> > +	if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags))
> > +		xfs_iflush_abort(ip);
> > +	else
> > +		xfs_iflags_clear(ip, XFS_IFLUSHING);
> 
> Er... why is the ifree code tearing into the inode log item state ?
> 
> Shouldn't this be getting done from the buffer log item when we release
> it and find that it's aborted?
> 
> --D

Yes, it doesn't looks good here, traverse buffer's b_li_list and abort xfs_inode
marked as XFS_ISTALE could be better.
> 
> > +
> > +out_iunlock:
> > +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > +	rcu_read_unlock();
> > +}
> > +
> > +/*
> > + * This is called to clean up inodes marked as stale in xfs_ifree
> > + */
> > +STATIC void
> > +xfs_ifree_abort(
> > +	struct xfs_inode	*ip,
> > +	struct xfs_icluster	*xic)
> > +{
> > +	struct xfs_mount	*mp = ip->i_mount;
> > +	struct xfs_perag        *pag;
> > +	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> > +	xfs_ino_t		inum = xic->first_ino;
> > +	int			nbufs;
> > +	int			i, j;
> > +	int			ioffset;
> > +
> > +	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
> > +
> > +	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
> > +
> > +	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
> > +		/*
> > +		 * The allocation bitmap tells us which inodes of the chunk were
> > +		 * physically allocated. Skip the cluster if an inode falls into
> > +		 * a sparse region.
> > +		 */
> > +		ioffset = inum - xic->first_ino;
> > +		if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
> > +			ASSERT(ioffset % igeo->inodes_per_cluster == 0);
> > +			continue;
> > +		}
> > +
> > +		for (i = 0; i < igeo->inodes_per_cluster; i++)
> > +			xfs_ifree_abort_inode_stale(pag, inum + i);
> > +
> > +	}
> > +	xfs_perag_put(pag);
> > +}
> > +
> >  /*
> >   * This is called to unpin an inode.  The caller must have the inode locked
> >   * in at least shared mode so that the buffer cannot be subsequently pinned
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index fa780f08dc89..423542bf6af1 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -499,7 +499,6 @@ uint		xfs_ilock_data_map_shared(struct xfs_inode *);
> >  uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
> >  
> >  uint		xfs_ip2xflags(struct xfs_inode *);
> > -int		xfs_ifree(struct xfs_trans *, struct xfs_inode *);
> >  int		xfs_itruncate_extents_flags(struct xfs_trans **,
> >  				struct xfs_inode *, int, xfs_fsize_t, int);
> >  void		xfs_iext_realloc(xfs_inode_t *, int, int);
> > -- 
> > 2.31.1
> >