Re: [PATCH 3/4] xfs: track unlinked inodes in core inode

Gao Xiang <hsiangkao@xxxxxxxxxx> · Tue, 7 Jul 2020 22:39:08 +0800

Hi Dave,

During working on this stuff, I found some another noticiable places
to mention. Hope it of some help...

On Tue, Jun 23, 2020 at 07:50:14PM +1000, Dave Chinner wrote:

...

>  /*
> - * This is called when the inode's link count has gone to 0 or we are creating
> - * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
> - *
> - * We place the on-disk inode on a list in the AGI.  It will be pulled from this
> - * list when the inode is freed.
> + * Always insert at the head, so we only have to do a next inode lookup to
> + * update it's prev pointer. The AGI bucket will point at the one we are
> + * inserting.
>   */
> -STATIC int
> -xfs_iunlink(
> +static int
> +xfs_iunlink_insert_inode(
>  	struct xfs_trans	*tp,
> +	struct xfs_buf		*agibp,
>  	struct xfs_inode	*ip)
>  {
>  	struct xfs_mount	*mp = tp->t_mountp;
> -	struct xfs_agi		*agi;
> -	struct xfs_buf		*agibp;
> -	xfs_agino_t		next_agino;
> -	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> +	struct xfs_agi		*agi = agibp->b_addr;
> +	xfs_agino_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);

unnecessary modification here... (use xfs_agnumber_t instead)

> -/* Return the imap, dinode pointer, and buffer for an inode. */
> -STATIC int
> -xfs_iunlink_map_ino(
> +/*
> + * Remove can be from anywhere in the list, so we have to do two adjacent inode
> + * lookups here so we can update list pointers. We may be at the head or the
> + * tail of the list, so we have to handle those cases as well.
> + */
> +static int
> +xfs_iunlink_remove_inode(
>  	struct xfs_trans	*tp,
> -	xfs_agnumber_t		agno,
> -	xfs_agino_t		agino,
> -	struct xfs_imap		*imap,
> -	struct xfs_dinode	**dipp,
> -	struct xfs_buf		**bpp)
> +	struct xfs_buf		*agibp,
> +	struct xfs_inode	*ip)
>  {
>  	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_agi		*agi = agibp->b_addr;
> +	xfs_agino_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);

same here

>  /*
> @@ -2762,56 +2735,107 @@ xlog_recover_process_one_iunlink(
>   * scheduled on this CPU to ensure other scheduled work can run without undue
>   * latency.
>   */
> -STATIC void
> -xlog_recover_process_iunlinks(
> -	struct xlog	*log)
> +static int
> +xlog_recover_iunlink_ag(
> +	struct xfs_mount	*mp,
> +	xfs_agnumber_t		agno)
>  {
> -	xfs_mount_t	*mp;
> -	xfs_agnumber_t	agno;
> -	xfs_agi_t	*agi;
> -	xfs_buf_t	*agibp;
> -	xfs_agino_t	agino;
> -	int		bucket;
> -	int		error;
> +	struct xfs_agi		*agi;
> +	struct xfs_buf		*agibp;
> +	int			bucket;
> +	int			error;
>  
> -	mp = log->l_mp;
> +	/*
> +	 * Find the agi for this ag.
> +	 */
> +	error = xfs_read_agi(mp, NULL, agno, &agibp);
> +	if (error) {
>  
> -	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
>  		/*
> -		 * Find the agi for this ag.
> +		 * AGI is b0rked. Don't process it.
> +		 *
> +		 * We should probably mark the filesystem as corrupt after we've
> +		 * recovered all the ag's we can....
>  		 */
> -		error = xfs_read_agi(mp, NULL, agno, &agibp);
> +		return 0;
> +	}
> +
> +	/*
> +	 * Unlock the buffer so that it can be acquired in the normal course of
> +	 * the transaction to truncate and free each inode.  Because we are not
> +	 * racing with anyone else here for the AGI buffer, we don't even need
> +	 * to hold it locked to read the initial unlinked bucket entries out of
> +	 * the buffer. We keep buffer reference though, so that it stays pinned
> +	 * in memory while we need the buffer.
> +	 */
> +	agi = agibp->b_addr;
> +	xfs_buf_unlock(agibp);
> +
> +	/*
> +	 * The unlinked inode list is maintained on incore inodes as a double
> +	 * linked list. We don't have any of that state in memory, so we have to
> +	 * create it as we go. This is simple as we are only removing from the
> +	 * head of the list and that means we only need to pull the current
> +	 * inode in and the next inode.  Inodes are unlinked when their
> +	 * reference count goes to zero, so we can overlap the xfs_iget() and
> +	 * xfs_irele() calls so we always have the first two inodes on the list
> +	 * in memory. Hence we can fake up the necessary in memory state for the
> +	 * unlink to "just work".
> +	 */
> +	for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
> +		struct xfs_inode	*ip, *prev_ip = NULL;
> +		xfs_agino_t		agino, prev_agino = NULLAGINO;
> +
> +		agino = be32_to_cpu(agi->agi_unlinked[bucket]);
> +		while (agino != NULLAGINO) {
> +			ip = xlog_recover_get_one_iunlink(mp, agno, agino,
> +							  bucket);
> +			if (!ip) {
> +				/*
> +				 * something busted, but still got to release
> +				 * prev_ip, so make it look like it's at the end
> +				 * of the list before it gets released.
> +				 */
> +				error = -EFSCORRUPTED;
> +				if (prev_ip)
> +					prev_ip->i_next_unlinked = NULLAGINO;
> +				break;
> +			}
> +			if (prev_ip) {
> +				ip->i_prev_unlinked = prev_agino;
> +				xfs_irele(prev_ip);
> +			}

(little confused about this, why not xfs_irele the current ip and move it after
agino = ip->i_next_unlinked; ....)

And some comments about "[PATCH 4/4] xfs: introduce inode unlink log item.
(no need to raise another thread about this since I'm not fully reviewing that..)

> @@ -2185,6 +2136,7 @@ xfs_iunlink(
>  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
>  	int			error;
>  
> +	ASSERT(ip->i_next_unlinked == NULLAGINO);
>  	ASSERT(VFS_I(ip)->i_nlink == 0);
>  	ASSERT(VFS_I(ip)->i_mode != 0);
>  	trace_xfs_iunlink(ip);

this ASSERT is not necessary since some unreasonable numbers could be loaded
from disk for crafted images... (seems only for debugging use...)

Thanks,
Gao Xiang