Re: [PATCH 06/13] xfs: replace iunlink backref lookups with list lookups

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Aug 12, 2020 at 07:25:49PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> Now we have an in memory linked list of all the inodes on the
> unlinked list, use that to look up inodes in the list that we need
> to modify when adding or removing from the list.
> 
> This means we are no longer using the backref cache to maintain the
> previous inode lookups, so we can remove all that infrastructure
> now.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
>  fs/xfs/xfs_error.c |   2 -
>  fs/xfs/xfs_inode.c | 327 ++++++++-------------------------------------
>  fs/xfs/xfs_inode.h |   3 -
>  fs/xfs/xfs_mount.c |   5 -
>  fs/xfs/xfs_trace.h |   1 -
>  5 files changed, 54 insertions(+), 284 deletions(-)
> 
> diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
> index 7f6e20899473..829a89418830 100644
> --- a/fs/xfs/xfs_error.c
> +++ b/fs/xfs/xfs_error.c
> @@ -162,7 +162,6 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin,	XFS_ERRTAG_LOG_ITEM_PIN);
>  XFS_ERRORTAG_ATTR_RW(buf_lru_ref,	XFS_ERRTAG_BUF_LRU_REF);
>  XFS_ERRORTAG_ATTR_RW(force_repair,	XFS_ERRTAG_FORCE_SCRUB_REPAIR);
>  XFS_ERRORTAG_ATTR_RW(bad_summary,	XFS_ERRTAG_FORCE_SUMMARY_RECALC);
> -XFS_ERRORTAG_ATTR_RW(iunlink_fallback,	XFS_ERRTAG_IUNLINK_FALLBACK);
>  XFS_ERRORTAG_ATTR_RW(buf_ioerror,	XFS_ERRTAG_BUF_IOERROR);
>  
>  static struct attribute *xfs_errortag_attrs[] = {
> @@ -200,7 +199,6 @@ static struct attribute *xfs_errortag_attrs[] = {
>  	XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
>  	XFS_ERRORTAG_ATTR_LIST(force_repair),
>  	XFS_ERRORTAG_ATTR_LIST(bad_summary),
> -	XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
>  	XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
>  	NULL,
>  };
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index dcf80ac51267..2c930de99561 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1893,196 +1893,29 @@ xfs_inactive(
>   * because we must walk that list to find the inode that points to the inode
>   * being removed from the unlinked hash bucket list.
>   *
> - * What if we modelled the unlinked list as a collection of records capturing
> - * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
> - * have a fast way to look up unlinked list predecessors, which avoids the
> - * slow list walk.  That's exactly what we do here (in-core) with a per-AG
> - * rhashtable.
> + * However, inodes that are on the unlinked list are also guaranteed to be in
> + * memory as they are loaded and then pinned in memory by whatever holds
> + * references to the inode to perform the unlink. Same goes for the O_TMPFILE
> + * usage of the unlinked list - those files are pinned in memory by an open file
> + * descriptor. Hence the inodes on the list are pinned in memory until they are
> + * removed from the list.
>   *
> - * Because this is a backref cache, we ignore operational failures since the
> - * iunlink code can fall back to the slow bucket walk.  The only errors that
> - * should bubble out are for obviously incorrect situations.
> + * That means we can simply use an in-memory double linked list to track inodes
> + * on the unlinked list. As we've removed the scalability problem resulting from
> + * removal on a single linked list requiring traversal, we also no longer use
> + * the on-disk hash to keep traversals short. We just use a single list on disk
> + * now, and track the previous inode in the list in memory.
>   *
> - * All users of the backref cache MUST hold the AGI buffer lock to serialize
> - * access or have otherwise provided for concurrency control.
> - */
> -
> -/* Capture a "X.next_unlinked = Y" relationship. */
> -struct xfs_iunlink {
> -	struct rhash_head	iu_rhash_head;
> -	xfs_agino_t		iu_agino;		/* X */
> -	xfs_agino_t		iu_next_unlinked;	/* Y */
> -};
> -
> -/* Unlinked list predecessor lookup hashtable construction */
> -static int
> -xfs_iunlink_obj_cmpfn(
> -	struct rhashtable_compare_arg	*arg,
> -	const void			*obj)
> -{
> -	const xfs_agino_t		*key = arg->key;
> -	const struct xfs_iunlink	*iu = obj;
> -
> -	if (iu->iu_next_unlinked != *key)
> -		return 1;
> -	return 0;
> -}
> -
> -static const struct rhashtable_params xfs_iunlink_hash_params = {
> -	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
> -	.key_len		= sizeof(xfs_agino_t),
> -	.key_offset		= offsetof(struct xfs_iunlink,
> -					   iu_next_unlinked),
> -	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
> -	.automatic_shrinking	= true,
> -	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
> -};
> -
> -/*
> - * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
> - * relation is found.
> + * To provide the guarantee that inodes are always on this in memory list, log
> + * recovery does what is necessary to populate the list sufficient to perform
> + * removal from the head of the list correctly. As such, we can now always rely
> + * on the in-memory list and if it differs from what we find on disk then we
> + * have a memory corruption problem or a software bug and so mismatches are now
> + * considered EFSCORRUPTION errors and are not recoverable.
> + *
> + * All users of the unlinked list MUST hold the AGI buffer lock to serialize
> + * access to the list.
>   */
> -static xfs_agino_t
> -xfs_iunlink_lookup_backref(
> -	struct xfs_perag	*pag,
> -	xfs_agino_t		agino)
> -{
> -	struct xfs_iunlink	*iu;
> -
> -	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
> -			xfs_iunlink_hash_params);
> -	return iu ? iu->iu_agino : NULLAGINO;
> -}
> -
> -/*
> - * Take ownership of an iunlink cache entry and insert it into the hash table.
> - * If successful, the entry will be owned by the cache; if not, it is freed.
> - * Either way, the caller does not own @iu after this call.
> - */
> -static int
> -xfs_iunlink_insert_backref(
> -	struct xfs_perag	*pag,
> -	struct xfs_iunlink	*iu)
> -{
> -	int			error;
> -
> -	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> -			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> -	/*
> -	 * Fail loudly if there already was an entry because that's a sign of
> -	 * corruption of in-memory data.  Also fail loudly if we see an error
> -	 * code we didn't anticipate from the rhashtable code.  Currently we
> -	 * only anticipate ENOMEM.
> -	 */
> -	if (error) {
> -		WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
> -		kmem_free(iu);
> -	}
> -	/*
> -	 * Absorb any runtime errors that aren't a result of corruption because
> -	 * this is a cache and we can always fall back to bucket list scanning.
> -	 */
> -	if (error != 0 && error != -EEXIST)
> -		error = 0;
> -	return error;
> -}
> -
> -/* Remember that @prev_agino.next_unlinked = @this_agino. */
> -static int
> -xfs_iunlink_add_backref(
> -	struct xfs_perag	*pag,
> -	xfs_agino_t		prev_agino,
> -	xfs_agino_t		this_agino)
> -{
> -	struct xfs_iunlink	*iu;
> -
> -	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
> -		return 0;
> -
> -	iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
> -	iu->iu_agino = prev_agino;
> -	iu->iu_next_unlinked = this_agino;
> -
> -	return xfs_iunlink_insert_backref(pag, iu);
> -}
> -
> -/*
> - * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
> - * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
> - * wasn't any such entry then we don't bother.
> - */
> -static int
> -xfs_iunlink_change_backref(
> -	struct xfs_perag	*pag,
> -	xfs_agino_t		agino,
> -	xfs_agino_t		next_unlinked)
> -{
> -	struct xfs_iunlink	*iu;
> -	int			error;
> -
> -	/* Look up the old entry; if there wasn't one then exit. */
> -	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
> -			xfs_iunlink_hash_params);
> -	if (!iu)
> -		return 0;
> -
> -	/*
> -	 * Remove the entry.  This shouldn't ever return an error, but if we
> -	 * couldn't remove the old entry we don't want to add it again to the
> -	 * hash table, and if the entry disappeared on us then someone's
> -	 * violated the locking rules and we need to fail loudly.  Either way
> -	 * we cannot remove the inode because internal state is or would have
> -	 * been corrupt.
> -	 */
> -	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
> -			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> -	if (error)
> -		return error;
> -
> -	/* If there is no new next entry just free our item and return. */
> -	if (next_unlinked == NULLAGINO) {
> -		kmem_free(iu);
> -		return 0;
> -	}
> -
> -	/* Update the entry and re-add it to the hash table. */
> -	iu->iu_next_unlinked = next_unlinked;
> -	return xfs_iunlink_insert_backref(pag, iu);
> -}
> -
> -/* Set up the in-core predecessor structures. */
> -int
> -xfs_iunlink_init(
> -	struct xfs_perag	*pag)
> -{
> -	return rhashtable_init(&pag->pagi_unlinked_hash,
> -			&xfs_iunlink_hash_params);
> -}
> -
> -/* Free the in-core predecessor structures. */
> -static void
> -xfs_iunlink_free_item(
> -	void			*ptr,
> -	void			*arg)
> -{
> -	struct xfs_iunlink	*iu = ptr;
> -	bool			*freed_anything = arg;
> -
> -	*freed_anything = true;
> -	kmem_free(iu);
> -}
> -
> -void
> -xfs_iunlink_destroy(
> -	struct xfs_perag	*pag)
> -{
> -	bool			freed_anything = false;
> -
> -	rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
> -			xfs_iunlink_free_item, &freed_anything);
> -
> -	ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
> -}
>  
>  /*
>   * Point the AGI unlinked bucket at an inode and log the results.  The caller
> @@ -2221,6 +2054,7 @@ xfs_iunlink_insert_inode(
>  {
>  	struct xfs_mount	*mp = tp->t_mountp;
>  	struct xfs_agi		*agi;
> +	struct xfs_inode	*nip;
>  	xfs_agino_t		next_agino;
>  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
>  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> @@ -2242,9 +2076,13 @@ xfs_iunlink_insert_inode(
>  		return -EFSCORRUPTED;
>  	}
>  
> -	if (next_agino != NULLAGINO) {
> +	nip = list_first_entry_or_null(&agibp->b_pag->pag_ici_unlink_list,
> +					struct xfs_inode, i_unlink);
> +	if (nip) {
>  		xfs_agino_t		old_agino;
>  
> +		ASSERT(next_agino == XFS_INO_TO_AGINO(mp, nip->i_ino));
> +
>  		/*
>  		 * There is already another inode in the bucket, so point this
>  		 * inode to the current head of the list.
> @@ -2254,14 +2092,8 @@ xfs_iunlink_insert_inode(
>  		if (error)
>  			return error;
>  		ASSERT(old_agino == NULLAGINO);
> -
> -		/*
> -		 * agino has been unlinked, add a backref from the next inode
> -		 * back to agino.
> -		 */
> -		error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
> -		if (error)
> -			return error;
> +	} else {
> +		ASSERT(next_agino == NULLAGINO);
>  	}
>  
>  	/* Point the head of the list to point to this inode. */
> @@ -2354,70 +2186,24 @@ xfs_iunlink_map_prev(
>  	xfs_agnumber_t		agno,
>  	xfs_agino_t		head_agino,
>  	xfs_agino_t		target_agino,
> -	xfs_agino_t		*agino,
> +	xfs_agino_t		agino,
>  	struct xfs_imap		*imap,
>  	struct xfs_dinode	**dipp,
>  	struct xfs_buf		**bpp,
>  	struct xfs_perag	*pag)
>  {
> -	struct xfs_mount	*mp = tp->t_mountp;
> -	xfs_agino_t		next_agino;
>  	int			error;
>  
>  	ASSERT(head_agino != target_agino);
>  	*bpp = NULL;
>  
> -	/* See if our backref cache can find it faster. */
> -	*agino = xfs_iunlink_lookup_backref(pag, target_agino);
> -	if (*agino != NULLAGINO) {
> -		error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
> -		if (error)
> -			return error;
> -
> -		if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
> -			return 0;
> -
> -		/*
> -		 * If we get here the cache contents were corrupt, so drop the
> -		 * buffer and fall back to walking the bucket list.
> -		 */
> -		xfs_trans_brelse(tp, *bpp);
> -		*bpp = NULL;
> -		WARN_ON_ONCE(1);
> -	}
> -
> -	trace_xfs_iunlink_map_prev_fallback(mp, agno);
> -
> -	/* Otherwise, walk the entire bucket until we find it. */
> -	next_agino = head_agino;
> -	while (next_agino != target_agino) {
> -		xfs_agino_t	unlinked_agino;
> -
> -		if (*bpp)
> -			xfs_trans_brelse(tp, *bpp);
> -
> -		*agino = next_agino;
> -		error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
> -				bpp);
> -		if (error)
> -			return error;
> -
> -		unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
> -		/*
> -		 * Make sure this pointer is valid and isn't an obvious
> -		 * infinite loop.
> -		 */
> -		if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
> -		    next_agino == unlinked_agino) {
> -			XFS_CORRUPTION_ERROR(__func__,
> -					XFS_ERRLEVEL_LOW, mp,
> -					*dipp, sizeof(**dipp));
> -			error = -EFSCORRUPTED;
> -			return error;
> -		}
> -		next_agino = unlinked_agino;
> -	}
> +	ASSERT(agino != NULLAGINO);
> +	error = xfs_iunlink_map_ino(tp, agno, agino, imap, dipp, bpp);
> +	if (error)
> +		return error;
>  
> +	if (be32_to_cpu((*dipp)->di_next_unlinked) != target_agino)
> +		return -EFSCORRUPTED;

Why drop the corruption report here?

--D

>  	return 0;
>  }
>  
> @@ -2461,27 +2247,31 @@ xfs_iunlink_remove_inode(
>  	if (error)
>  		return error;
>  
> -	/*
> -	 * If there was a backref pointing from the next inode back to this
> -	 * one, remove it because we've removed this inode from the list.
> -	 *
> -	 * Later, if this inode was in the middle of the list we'll update
> -	 * this inode's backref to point from the next inode.
> -	 */
> -	if (next_agino != NULLAGINO) {
> -		error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
> -				NULLAGINO);
> -		if (error)
> -			return error;
> +#ifdef DEBUG
> +	{
> +	struct xfs_inode *nip = list_next_entry(ip, i_unlink);
> +	if (nip)
> +		ASSERT(next_agino == XFS_INO_TO_AGINO(mp, nip->i_ino));
> +	else
> +		ASSERT(next_agino == NULLAGINO);
>  	}
> +#endif
> +
> +	if (ip != list_first_entry(&agibp->b_pag->pag_ici_unlink_list,
> +					struct xfs_inode, i_unlink)) {
>  
> -	if (head_agino != agino) {
> +		struct xfs_inode *pip;
>  		struct xfs_imap	imap;
>  		xfs_agino_t	prev_agino;
>  
> +		ASSERT(head_agino != agino);
> +
> +		pip = list_prev_entry(ip, i_unlink);
> +		prev_agino = XFS_INO_TO_AGINO(mp, pip->i_ino);
> +
>  		/* We need to search the list for the inode being freed. */
>  		error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
> -				&prev_agino, &imap, &last_dip, &last_ibp,
> +				prev_agino, &imap, &last_dip, &last_ibp,
>  				agibp->b_pag);
>  		if (error)
>  			return error;
> @@ -2490,16 +2280,7 @@ xfs_iunlink_remove_inode(
>  		xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
>  				last_dip, &imap, next_agino);
>  
> -		/*
> -		 * Now we deal with the backref for this inode.  If this inode
> -		 * pointed at a real inode, change the backref that pointed to
> -		 * us to point to our old next.  If this inode was the end of
> -		 * the list, delete the backref that pointed to us.  Note that
> -		 * change_backref takes care of deleting the backref if
> -		 * next_agino is NULLAGINO.
> -		 */
> -		return xfs_iunlink_change_backref(agibp->b_pag, agino,
> -				next_agino);
> +		return 0;
>  	}
>  
>  	/* Point the head of the list to the next unlinked inode. */
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 73f36908a1ce..7f8fbb7c8594 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -464,9 +464,6 @@ extern struct kmem_zone	*xfs_inode_zone;
>  /* The default CoW extent size hint. */
>  #define XFS_DEFAULT_COWEXTSZ_HINT 32
>  
> -int xfs_iunlink_init(struct xfs_perag *pag);
> -void xfs_iunlink_destroy(struct xfs_perag *pag);
> -
>  void xfs_end_io(struct work_struct *work);
>  
>  int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 2def15297a5f..f28c969af272 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -146,7 +146,6 @@ xfs_free_perag(
>  		spin_unlock(&mp->m_perag_lock);
>  		ASSERT(pag);
>  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> -		xfs_iunlink_destroy(pag);
>  		xfs_buf_hash_destroy(pag);
>  		call_rcu(&pag->rcu_head, __xfs_free_perag);
>  	}
> @@ -224,9 +223,6 @@ xfs_initialize_perag(
>  		/* first new pag is fully initialized */
>  		if (first_initialised == NULLAGNUMBER)
>  			first_initialised = index;
> -		error = xfs_iunlink_init(pag);
> -		if (error)
> -			goto out_hash_destroy;
>  		spin_lock_init(&pag->pag_state_lock);
>  	}
>  
> @@ -249,7 +245,6 @@ xfs_initialize_perag(
>  		if (!pag)
>  			break;
>  		xfs_buf_hash_destroy(pag);
> -		xfs_iunlink_destroy(pag);
>  		kmem_free(pag);
>  	}
>  	return error;
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index abb1d859f226..acddc60f6d88 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3514,7 +3514,6 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \
>  	TP_ARGS(ip))
>  DEFINE_AGINODE_EVENT(xfs_iunlink);
>  DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
> -DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
>  
>  DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
>  	TP_PROTO(struct xfs_mount *mp, unsigned int flags),
> -- 
> 2.26.2.761.g0e0b3e54be
> 



[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux