Re: [PATCH 2/3] xfs: convert inode cache lookups to use RCU locking

"Paul E. McKenney" <paulmck@xxxxxxxxxxxxxxxxxx> · Tue, 14 Dec 2010 13:18:01 -0800

On Mon, Dec 13, 2010 at 12:32:36PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> With delayed logging greatly increasing the sustained parallelism of inode
> operations, the inode cache locking is showing significant read vs write
> contention when inode reclaim runs at the same time as lookups. There is
> also a lot more write lock acquistions than there are read locks (4:1 ratio)
> so the read locking is not really buying us much in the way of parallelism.
> 
> To avoid the read vs write contention, change the cache to use RCU locking on
> the read side. To avoid needing to RCU free every single inode, use the built
> in slab RCU freeing mechanism. This requires us to be able to detect lookups of
> freed inodes, so enÑure that ever freed inode has an inode number of zero and
> the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag in cache hit
> lookup path, but also add a check for a zero inode number as well.
> 
> We canthen convert all the read locking lockups to use RCU read side locking
> and hence remove all read side locking.

OK, so the xfs_inode uses straight RCU, and there fore cannot be freed and
immediately reallocated, while the inode itself uses SLAB_DESTROY_BY_RCU,
which does allow the inode to be freed and immediately reallocated,
correct?

Some questions and comments below.

							Thanx, Paul

> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> Reviewed-by: Alex Elder <aelder@xxxxxxx>
> ---
>  fs/xfs/linux-2.6/xfs_sync.c |   27 ++++++++++++++++-----
>  fs/xfs/xfs_iget.c           |   50 +++++++++++++++++++++++++++++++----------
>  fs/xfs/xfs_inode.c          |   52 +++++++++++++++++++++++++++++++++----------
>  3 files changed, 98 insertions(+), 31 deletions(-)
> 
> diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
> index afb0d7c..5ee02d7 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.c
> +++ b/fs/xfs/linux-2.6/xfs_sync.c
> @@ -53,14 +53,20 @@ xfs_inode_ag_walk_grab(
>  {
>  	struct inode		*inode = VFS_I(ip);
> 
> +	/* check for stale RCU freed inode */
> +	spin_lock(&ip->i_flags_lock);
> +	if (!ip->i_ino)
> +		goto out_unlock_noent;
> +
> +	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
> +	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
> +		goto out_unlock_noent;
> +	spin_unlock(&ip->i_flags_lock);
> +

OK, this works because the xfs_inode cannot be freed and reallocated (which
the RCU change should enforce).  It is not clear to me that the above
would work if using the SLAB_DESTROY_BY_RCU approach, at least not unless
some higher-level checks can reliably catch an inode changing identity
due to quick free and reallocation.

Also, all calls to xfs_inode_ag_walk_grab() must be in RCU read-side
critical sections...  I suggest a debug check for rcu_read_lock_held() to
catch any call paths that might have slipped through.  At first glance,
it appears that RCU is replacing some of ->pag_ici_lock, but I could
easily be mistaken.

>  	/* nothing to sync during shutdown */
>  	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
>  		return EFSCORRUPTED;
> 
> -	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
> -	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
> -		return ENOENT;
> -
>  	/* If we can't grab the inode, it must on it's way to reclaim. */
>  	if (!igrab(inode))
>  		return ENOENT;
> @@ -72,6 +78,10 @@ xfs_inode_ag_walk_grab(
> 
>  	/* inode is valid */
>  	return 0;
> +
> +out_unlock_noent:
> +	spin_unlock(&ip->i_flags_lock);
> +	return ENOENT;
>  }
> 
>  STATIC int
> @@ -98,12 +108,12 @@ restart:
>  		int		error = 0;
>  		int		i;
> 
> -		read_lock(&pag->pag_ici_lock);
> +		rcu_read_lock();
>  		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
>  					(void **)batch, first_index,
>  					XFS_LOOKUP_BATCH);
>  		if (!nr_found) {
> -			read_unlock(&pag->pag_ici_lock);
> +			rcu_read_unlock();
>  			break;
>  		}
> 
> @@ -129,7 +139,7 @@ restart:
>  		}
> 
>  		/* unlock now we've grabbed the inodes. */
> -		read_unlock(&pag->pag_ici_lock);
> +		rcu_read_unlock();
> 
>  		for (i = 0; i < nr_found; i++) {
>  			if (!batch[i])
> @@ -639,6 +649,9 @@ xfs_reclaim_inode_grab(
>  	struct xfs_inode	*ip,
>  	int			flags)
>  {
> +	/* check for stale RCU freed inode */
> +	if (!ip->i_ino)
> +		return 1;

Does this mean that we need to be under rcu_read_lock() here?  If not,
how do we keep the inode from really being freed out from under us?
(Again, if we do need to be under rcu_read_lock(), I highly recommend
a debug check for rcu_read_lock_held().)

>  	/*
>  	 * do some unlocked checks first to avoid unnecceary lock traffic.
> diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
> index 9fae475..1e3b035 100644
> --- a/fs/xfs/xfs_iget.c
> +++ b/fs/xfs/xfs_iget.c
> @@ -80,6 +80,7 @@ xfs_inode_alloc(
>  	ASSERT(atomic_read(&ip->i_pincount) == 0);
>  	ASSERT(!spin_is_locked(&ip->i_flags_lock));
>  	ASSERT(completion_done(&ip->i_flush));
> +	ASSERT(ip->i_ino == 0);
> 
>  	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
>  	lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
> @@ -98,9 +99,6 @@ xfs_inode_alloc(
>  	ip->i_size = 0;
>  	ip->i_new_size = 0;
> 
> -	/* prevent anyone from using this yet */
> -	VFS_I(ip)->i_state = I_NEW;
> -
>  	return ip;
>  }
> 
> @@ -159,6 +157,16 @@ xfs_inode_free(
>  	ASSERT(!spin_is_locked(&ip->i_flags_lock));
>  	ASSERT(completion_done(&ip->i_flush));
> 
> +	/*
> +	 * Because we use RCU freeing we need to ensure the inode always
> +	 * appears to be reclaimed with an invalid inode number when in the
> +	 * free state. The ip->i_flags_lock provides the barrier against lookup
> +	 * races.
> +	 */
> +	spin_lock(&ip->i_flags_lock);
> +	ip->i_flags = XFS_IRECLAIM;
> +	ip->i_ino = 0;
> +	spin_unlock(&ip->i_flags_lock);

OK, good!

>  	call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free);
>  }
> 
> @@ -169,14 +177,32 @@ static int
>  xfs_iget_cache_hit(
>  	struct xfs_perag	*pag,
>  	struct xfs_inode	*ip,
> +	xfs_ino_t		ino,
>  	int			flags,
> -	int			lock_flags) __releases(pag->pag_ici_lock)
> +	int			lock_flags) __releases(RCU)
>  {
>  	struct inode		*inode = VFS_I(ip);
>  	struct xfs_mount	*mp = ip->i_mount;
>  	int			error;
> 
> +	/*
> +	 * check for re-use of an inode within an RCU grace period due to the
> +	 * radix tree nodes not being updated yet. We monitor for this by
> +	 * setting the inode number to zero before freeing the inode structure.
> +	 * If the inode has been reallocated and set up, then the inode number
> +	 * will not match, so check for that, too.
> +	 */
>  	spin_lock(&ip->i_flags_lock);
> +	if (ip->i_ino != ino) {
> +		trace_xfs_iget_skip(ip);
> +		XFS_STATS_INC(xs_ig_frecycle);
> +		spin_unlock(&ip->i_flags_lock);
> +		rcu_read_unlock();
> +		/* Expire the grace period so we don't trip over it again. */
> +		synchronize_rcu();

Hmmm...  Interesting.  Wouldn't the fact that we acquired the same lock
that was held after removing the inode guarantee that an immediate retry
would manage not to find this same inode again?

If this is not the case, then readers finding it again will not be
protected by the RCU grace period, right?

In short, I don't understand why the synchronize_rcu() is needed.
If it is somehow helping, that sounds to me like it is covering up
a real bug that should be fixed separately.

> +		return EAGAIN;
> +	}
> +
> 
>  	/*
>  	 * If we are racing with another cache hit that is currently
> @@ -219,7 +245,7 @@ xfs_iget_cache_hit(
>  		ip->i_flags |= XFS_IRECLAIM;
> 
>  		spin_unlock(&ip->i_flags_lock);
> -		read_unlock(&pag->pag_ici_lock);
> +		rcu_read_unlock();
> 
>  		error = -inode_init_always(mp->m_super, inode);
>  		if (error) {
> @@ -227,7 +253,7 @@ xfs_iget_cache_hit(
>  			 * Re-initializing the inode failed, and we are in deep
>  			 * trouble.  Try to re-add it to the reclaim list.
>  			 */
> -			read_lock(&pag->pag_ici_lock);
> +			rcu_read_lock();
>  			spin_lock(&ip->i_flags_lock);
> 
>  			ip->i_flags &= ~XFS_INEW;
> @@ -261,7 +287,7 @@ xfs_iget_cache_hit(
> 
>  		/* We've got a live one. */
>  		spin_unlock(&ip->i_flags_lock);
> -		read_unlock(&pag->pag_ici_lock);
> +		rcu_read_unlock();
>  		trace_xfs_iget_hit(ip);
>  	}
> 
> @@ -275,7 +301,7 @@ xfs_iget_cache_hit(
> 
>  out_error:
>  	spin_unlock(&ip->i_flags_lock);
> -	read_unlock(&pag->pag_ici_lock);
> +	rcu_read_unlock();
>  	return error;
>  }
> 
> @@ -397,7 +423,7 @@ xfs_iget(
>  	xfs_agino_t	agino;
> 
>  	/* reject inode numbers outside existing AGs */
> -	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
> +	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)

For the above check to be safe, don't we need to already be in an
RCU read-side critical section?  Or is something else protecting us?

>  		return EINVAL;
> 
>  	/* get the perag structure and ensure that it's inode capable */
> @@ -406,15 +432,15 @@ xfs_iget(
> 
>  again:
>  	error = 0;
> -	read_lock(&pag->pag_ici_lock);
> +	rcu_read_lock();
>  	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> 
>  	if (ip) {
> -		error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
> +		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
>  		if (error)
>  			goto out_error_or_again;
>  	} else {
> -		read_unlock(&pag->pag_ici_lock);
> +		rcu_read_unlock();
>  		XFS_STATS_INC(xs_ig_missed);
> 
>  		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 108c7a0..43ffd90 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
>  		 */
>  		for (i = 0; i < ninodes; i++) {
>  retry:
> -			read_lock(&pag->pag_ici_lock);
> +			rcu_read_lock();
>  			ip = radix_tree_lookup(&pag->pag_ici_root,
>  					XFS_INO_TO_AGINO(mp, (inum + i)));
> 
> -			/* Inode not in memory or stale, nothing to do */
> -			if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
> -				read_unlock(&pag->pag_ici_lock);
> +			/* Inode not in memory, nothing to do */
> +			if (!ip) {
> +				rcu_read_unlock();
>  				continue;
>  			}
> 
>  			/*
> +			 * because this is an RCU protected lookup, we could
> +			 * find a recently freed or even reallocated inode

And here the inode might be immediately freed and then reallocated.
(In contrast, this cannot happen to the xfs_inode.)

> +			 * during the lookup. We need to check under the
> +			 * i_flags_lock for a valid inode here. Skip it if it
> +			 * is not valid, the wrong inode or stale.
> +			 */
> +			spin_lock(&ip->i_flags_lock);
> +			if (ip->i_ino != inum + i ||
> +			    __xfs_iflags_test(ip, XFS_ISTALE)) {
> +				spin_unlock(&ip->i_flags_lock);
> +				rcu_read_unlock();
> +				continue;
> +			}
> +			spin_unlock(&ip->i_flags_lock);
> +
> +			/*
>  			 * Don't try to lock/unlock the current inode, but we
>  			 * _cannot_ skip the other inodes that we did not find
>  			 * in the list attached to the buffer and are not
> @@ -2019,11 +2035,11 @@ retry:
>  			 */
>  			if (ip != free_ip &&
>  			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
> -				read_unlock(&pag->pag_ici_lock);
> +				rcu_read_unlock();
>  				delay(1);
>  				goto retry;
>  			}
> -			read_unlock(&pag->pag_ici_lock);
> +			rcu_read_unlock();
> 
>  			xfs_iflock(ip);
>  			xfs_iflags_set(ip, XFS_ISTALE);
> @@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
> 
>  	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
>  	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
> -	read_lock(&pag->pag_ici_lock);
> +	rcu_read_lock();
>  	/* really need a gang lookup range call here */
>  	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
>  					first_index, inodes_per_cluster);
> @@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
>  		iq = ilist[i];
>  		if (iq == ip)
>  			continue;
> -		/* if the inode lies outside this cluster, we're done. */
> -		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
> -			break;
> +
> +		/*
> +		 * because this is an RCU protected lookup, we could find a
> +		 * recently freed or even reallocated inode during the lookup.
> +		 * We need to check under the i_flags_lock for a valid inode
> +		 * here. Skip it if it is not valid or the wrong inode.
> +		 */
> +		spin_lock(&ip->i_flags_lock);
> +		if (!ip->i_ino ||
> +		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
> +			spin_unlock(&ip->i_flags_lock);
> +			continue;
> +		}
> +		spin_unlock(&ip->i_flags_lock);
> +
>  		/*
>  		 * Do an un-protected check to see if the inode is dirty and
>  		 * is a candidate for flushing.  These checks will be repeated
> @@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
>  	}
> 
>  out_free:
> -	read_unlock(&pag->pag_ici_lock);
> +	rcu_read_unlock();
>  	kmem_free(ilist);
>  out_put:
>  	xfs_perag_put(pag);
> @@ -2704,7 +2732,7 @@ cluster_corrupt_out:
>  	 * Corruption detected in the clustering loop.  Invalidate the
>  	 * inode buffer and shut down the filesystem.
>  	 */
> -	read_unlock(&pag->pag_ici_lock);
> +	rcu_read_unlock();
>  	/*
>  	 * Clean up the buffer.  If it was B_DELWRI, just release it --
>  	 * brelse can handle it with no problems.  If not, shut down the
> -- 
> 1.7.2.3
> 

_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs