Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root

NeilBrown <neilb@xxxxxxx> · Wed, 6 May 2015 08:26:28 +1000

On Tue, 5 May 2015 11:52:03 -0400 "J. Bruce Fields" <bfields@xxxxxxxxxxxx>
wrote:

> On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> > On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > > + * Unlike lookup_one_len, it should be called without the parent
> > > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > > + */
> > > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > > +				       struct dentry *base, int len)
> > > > +{
> > > > +	struct qstr this;
> > > > +	unsigned int c;
> > > > +	int err;
> > > > +	struct dentry *ret;
> > > > +
> > > > +	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > > 
> > > Remove this line.
> > 
> > Whoops, thanks.
> > 
> > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > index a30e79900086..cc7995762190 100644
> > > > --- a/fs/nfsd/vfs.c
> > > > +++ b/fs/nfsd/vfs.c
> > > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > >  		host_err = PTR_ERR(dentry);
> > > >  		if (IS_ERR(dentry))
> > > >  			goto out_nfserr;
> > > > +		if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > > 
> > > Got a crash here tested by pynfs,
> > 
> > OK, I guess I just forgot to take into account negative dentries.
> > Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
> 
> And I also forgot nfs3xdr.c.
> 
> --b.
> 
> commit 6c942d342f9f
> Author: NeilBrown <neilb@xxxxxxx>
> Date:   Fri May 1 11:53:26 2015 +1000
> 
>     nfsd: don't hold i_mutex over userspace upcalls
>     
>     We need information about exports when crossing mountpoints during
>     lookup or NFSv4 readdir.  If we don't already have that information
>     cached, we may have to ask (and wait for) rpc.mountd.
>     
>     In both cases we currently hold the i_mutex on the parent of the
>     directory we're asking rpc.mountd about.  We've seen situations where
>     rpc.mountd performs some operation on that directory that tries to take
>     the i_mutex again, resulting in deadlock.
>     
>     With some care, we may be able to avoid that in rpc.mountd.  But it
>     seems better just to avoid holding a mutex while waiting on userspace.
>     
>     It appears that lookup_one_len is pretty much the only operation that
>     needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
>     something like
>     
>     	mutex_lock()
>     	lookup_one_len()
>     	mutex_unlock()
>     
>     In many cases though the lookup would have been cached and not required
>     the i_mutex, so it's more efficient to create a lookup_one_len() variant
>     that only takes the i_mutex when necessary.
>     
>     Signed-off-by: J. Bruce Fields <bfields@xxxxxxxxxx>
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 4a8d998b7274..0f554bf41dea 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
>   *
>   * Note that this routine is purely a helper for filesystem usage and should
>   * not be called by generic code.
> + *
> + * Must be called with the parented i_mutex held.

"parented"?

    * base->i_mutex must be held by caller.

??

>   */
>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  {
> @@ -2182,6 +2184,66 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  }
>  EXPORT_SYMBOL(lookup_one_len);
>  
> +/**
> + * lookup_one_len - filesystem helper to lookup single pathname component
      lookup_one_len_unlocked - ..

> + * @name:	pathname component to lookup
> + * @base:	base directory to lookup from
> + * @len:	maximum length @len should be interpreted to
> + *
> + * Note that this routine is purely a helper for filesystem usage and should
> + * not be called by generic code.
> + *
> + * Unlike lookup_one_len, it should be called without the parent
> + * i_mutex held, and will take the i_mutex itself if necessary.
> + */
> +struct dentry *lookup_one_len_unlocked(const char *name,
> +				       struct dentry *base, int len)
> +{
> +	struct qstr this;
> +	unsigned int c;
> +	int err;
> +	struct dentry *ret;
> +
> +	this.name = name;
> +	this.len = len;
> +	this.hash = full_name_hash(name, len);
> +	if (!len)
> +		return ERR_PTR(-EACCES);
> +
> +	if (unlikely(name[0] == '.')) {
> +		if (len < 2 || (len == 2 && name[1] == '.'))
> +			return ERR_PTR(-EACCES);
> +	}
> +
> +	while (len--) {
> +		c = *(const unsigned char *)name++;
> +		if (c == '/' || c == '\0')
> +			return ERR_PTR(-EACCES);
> +	}
> +	/*
> +	 * See if the low-level filesystem might want
> +	 * to use its own hash..
> +	 */
> +	if (base->d_flags & DCACHE_OP_HASH) {
> +		int err = base->d_op->d_hash(base, &this);
> +		if (err < 0)
> +			return ERR_PTR(err);
> +	}
> +
> +	err = inode_permission(base->d_inode, MAY_EXEC);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	ret = __d_lookup(base, &this);
> +	if (ret)
> +		return ret;

__d_lookup() is a little too low level, as it doesn't call d_revalidate() and
it doesn't retry if the rename_lock seqlock says it should.

The latter doesn't really matter as we would fall through to the safe
mutex-protected version.
The former isn't much of an issue for most filesystems under nfsd, but still
needs to be handled to some extent.
Also, the comment for __d_lookup says

 * __d_lookup callers must be commented.

The simplest resolution would be:

	/* __d_lookup() is used to try to get a quick answer and avoid the
	 * mutex.  A false-negative does no harm.
	 */
	ret = __d_lookup(base, &this);
	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
		dput(ret);
		ret = NULL;
	}
	if (ret)
		return ret;

It probably wouldn't be too much harder to add the d_revalidate() call, and
call d_invalidate() if it returns zero.  

Maybe.

	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
		int err = d_revalidate(ret, 0);
		if (err == 0) {
			d_invalidate(ret);
			dput(ret);
			ret = NULL;
		} else if (err < 0) {
			dput(ret);
			return ERR_PTR(err);
		}
	}

> +	mutex_lock(&base->d_inode->i_mutex);
> +	ret =  __lookup_hash(&this, base, 0);
> +	mutex_unlock(&base->d_inode->i_mutex);
> +	return ret;
> +}
> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> +
>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
>  		 struct path *path, int *empty)
>  {
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index f6e7cbabac5a..01dcd494f781 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>  		} else
>  			dchild = dget(dparent);
>  	} else
> -		dchild = lookup_one_len(name, dparent, namlen);
> +		dchild = lookup_one_len_unlocked(name, dparent, namlen);
>  	if (IS_ERR(dchild))
>  		return rv;
>  	if (d_mountpoint(dchild))
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 158badf945df..2c1adaa0bd2f 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
>  	__be32 nfserr;
>  	int ignore_crossmnt = 0;
>  
> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
>  	if (IS_ERR(dentry))
>  		return nfserrno(PTR_ERR(dentry));
>  	if (d_really_is_negative(dentry)) {
>  		/*
> -		 * nfsd_buffered_readdir drops the i_mutex between
> -		 * readdir and calling this callback, leaving a window
> -		 * where this directory entry could have gone away.
> +		 * we're not holding the i_mutex here, so there's
> +		 * a window where this directory entry could have gone
> +		 * away.
>  		 */
>  		dput(dentry);
>  		return nfserr_noent;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index a30e79900086..4accdb00976b 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		host_err = PTR_ERR(dentry);
>  		if (IS_ERR(dentry))
>  			goto out_nfserr;
> +		if (!(d_inode(dentry) && S_ISREG(d_inode(dentry)->i_mode))) {
> +			/*
> +			 * Never mind, we're not going to open this
> +			 * anyway.  And we don't want to hold it over
> +			 * the userspace upcalls in nfsd_mountpoint. */
> +			fh_unlock(fhp);
> +		}

You could avoid the test by just putting the fh_unlock(fhp) inside the 

		if (nfsd_mountpoint(dentry, exp)) {

block.  It might also be appropriate for nfsd_mountpoint to fail on
non-directories.

Up to you though.

Thanks.  Feel free to add
  *-by: NeilBrown <neilb@xxxxxxx>

as you see fit.

NeilBrown

>  		/*
>  		 * check if we have crossed a mount point ...
>  		 */
> @@ -1876,7 +1883,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  	offset = *offsetp;
>  
>  	while (1) {
> -		struct inode *dir_inode = file_inode(file);
>  		unsigned int reclen;
>  
>  		cdp->err = nfserr_eof; /* will be cleared on successful read */
> @@ -1895,15 +1901,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  		if (!size)
>  			break;
>  
> -		/*
> -		 * Various filldir functions may end up calling back into
> -		 * lookup_one_len() and the file system's ->lookup() method.
> -		 * These expect i_mutex to be held, as it would within readdir.
> -		 */
> -		host_err = mutex_lock_killable(&dir_inode->i_mutex);
> -		if (host_err)
> -			break;
> -
>  		de = (struct buffered_dirent *)buf.dirent;
>  		while (size > 0) {
>  			offset = de->offset;
> @@ -1920,7 +1917,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  			size -= reclen;
>  			de = (struct buffered_dirent *)((char *)de + reclen);
>  		}
> -		mutex_unlock(&dir_inode->i_mutex);
>  		if (size > 0) /* We bailed out early */
>  			break;
>  
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index c8990779f0c3..bb3a2f7cca67 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
>  extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>  
>  extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>  
>  extern int follow_down_one(struct path *);
>  extern int follow_down(struct path *);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Attachment:
pgpDhIA5HoexD.pgp

Description: OpenPGP digital signature