Re: [PATCH -V5 3/8] vfs: Add open by file handle support

"Aneesh Kumar K. V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx> · Tue, 27 Apr 2010 11:40:34 +0530

On Tue, 27 Apr 2010 11:05:45 +1000, Neil Brown <neilb@xxxxxxx> wrote:
> On Mon, 26 Apr 2010 23:04:01 +0530
> "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx> wrote:
> 
> > Acked-by: Serge Hallyn <serue@xxxxxxxxxx>
> > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> > ---
> >  fs/filesystems.c              |   32 +++++++++-
> >  fs/namei.c                    |   24 -------
> >  fs/namespace.c                |   38 +++++++++++
> >  fs/open.c                     |  148 +++++++++++++++++++++++++++++++++++++++++
> >  fs/pnode.c                    |    2 +-
> >  include/linux/mnt_namespace.h |    2 +
> >  include/linux/namei.h         |   24 +++++++
> >  7 files changed, 244 insertions(+), 26 deletions(-)
> > 
> > diff --git a/fs/filesystems.c b/fs/filesystems.c
> > index 68ba492..743d36e 100644
> > --- a/fs/filesystems.c
> > +++ b/fs/filesystems.c
> > @@ -281,5 +281,35 @@ struct file_system_type *get_fs_type(const char *name)
> >  	}
> >  	return fs;
> >  }
> > -
> >  EXPORT_SYMBOL(get_fs_type);
> > +
> > +struct super_block *fs_get_sb(struct uuid *fsid)
> > +{
> > +	struct uuid *this_fsid;
> > +	struct file_system_type *fs_type;
> > +	struct super_block *sb, *found_sb = NULL;
> > +
> > +	read_lock(&file_systems_lock);
> > +	fs_type = file_systems;
> > +	while (fs_type) {
> 
> You've open-coded a for-loop here...
> Why not:
>         for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) {
> ??

Fixed

> 
> 
> > +		spin_lock(&sb_lock);
> > +		list_for_each_entry(sb, &fs_type->fs_supers, s_instances) {
> > +			if (!sb->s_op->get_fsid)
> > +				continue;
> > +			this_fsid = sb->s_op->get_fsid(sb);
> > +			if (!memcmp(fsid->uuid, this_fsid->uuid,
> > +					sizeof(this_fsid->uuid))) {
> > +				/* found the matching super_block */
> > +				atomic_inc(&sb->s_active);
> > +				found_sb = sb;
> > +				spin_unlock(&sb_lock);
> > +				goto out;
> > +			}
> > +		}
> > +		spin_unlock(&sb_lock);
> > +		fs_type = fs_type->next;
> > +	}
> > +out:
> > +	read_unlock(&file_systems_lock);
> > +	return found_sb;
> > +}
> > diff --git a/fs/namei.c b/fs/namei.c
> > index a7dce91..a18711e 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -1521,30 +1521,6 @@ out_unlock:
> >  	return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
> >  }
> >  
> > -/*
> > - * Note that while the flag value (low two bits) for sys_open means:
> > - *	00 - read-only
> > - *	01 - write-only
> > - *	10 - read-write
> > - *	11 - special
> > - * it is changed into
> > - *	00 - no permissions needed
> > - *	01 - read-permission
> > - *	10 - write-permission
> > - *	11 - read-write
> > - * for the internal routines (ie open_namei()/follow_link() etc)
> > - * This is more logical, and also allows the 00 "no perm needed"
> > - * to be used for symlinks (where the permissions are checked
> > - * later).
> > - *
> > -*/
> > -static inline int open_to_namei_flags(int flag)
> > -{
> > -	if ((flag+1) & O_ACCMODE)
> > -		flag++;
> > -	return flag;
> > -}
> > -
> >  static int open_will_truncate(int flag, struct inode *inode)
> >  {
> >  	/*
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index 8174c8a..6168526 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -2364,3 +2364,41 @@ void put_mnt_ns(struct mnt_namespace *ns)
> >  	kfree(ns);
> >  }
> >  EXPORT_SYMBOL(put_mnt_ns);
> > +
> > +/*
> > + * Get any vfsmount mapping the superblock in the
> > + * task namespace
> > + */
> > +struct vfsmount *fs_get_vfsmount(struct task_struct *task,
> > +				struct super_block *sb)
> > +{
> > +	struct nsproxy *nsp;
> > +	struct list_head *mount_list;
> > +	struct mnt_namespace *ns = NULL;
> > +	struct vfsmount *mnt, *sb_mnt = NULL;
> > +
> > +	rcu_read_lock();
> > +	nsp = task_nsproxy(task);
> > +	if (nsp) {
> > +		ns = nsp->mnt_ns;
> > +		if (ns)
> > +			get_mnt_ns(ns);
> > +	}
> > +	rcu_read_unlock();
> > +	if (!ns)
> > +		return NULL;
> > +	down_read(&namespace_sem);
> > +	list_for_each(mount_list, &ns->list) {
> > +		mnt = list_entry(mount_list, struct vfsmount, mnt_list);
> > +		if (mnt->mnt_sb == sb) {
> > +			/* found the matching super block */
> > +			sb_mnt = mnt;
> > +			mntget(sb_mnt);
> > +			break;
> > +		}
> > +	}
> > +	up_read(&namespace_sem);
> > +
> > +	put_mnt_ns(ns);
> > +	return sb_mnt;
> > +}
> 
> If a task has the same filesystem mounted several times in it's namespace
> (via "mount --bind") you just return any arbitrary one (the first).
> 
> Suppose that the filesystem does appear twice, and that in one appearance a
> particular directory is mounted on, while in another appearance that
> directory is not mounted on.  Suppose that the mounted-on appearance is
> listed first in ns->list
> 
> Now we use a series of "name_to_handle" / "open_by_handle" calls to descend
> through the second appearance to something beneath that directory which (in
> the first appearance) is mounted-on.
> This will not work as you will actually be descending the first appearance of
> the filesystem.
> I really think you should leave the choice of filesystem/mountpoint to
> user-space.
> 

Will comment on this on a separate thread.

> 
> > diff --git a/fs/open.c b/fs/open.c
> > index e9aae5c..b87fa32 100644
> > --- a/fs/open.c
> > +++ b/fs/open.c
> > @@ -1306,3 +1306,151 @@ SYSCALL_DEFINE2(lname_to_handle, const char __user *, name,
> >  	asmlinkage_protect(2, ret, name, handle);
> >  	return ret;
> >  }
> > +
> > +static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
> > +{
> > +	return 1;
> > +}
> > +
> > +static struct dentry *handle_to_dentry(struct vfsmount *mnt,
> > +				struct file_handle *handle)
> > +{
> > +	int handle_size;
> > +	struct dentry *dentry;
> > +
> > +	/* change the handle size to multiple of sizeof(u32) */
> > +	handle_size = handle->handle_size >> 2;
> > +	dentry = exportfs_decode_fh(mnt, (struct fid *)handle->f_handle,
> > +					handle_size, handle->handle_type,
> > +					vfs_dentry_acceptable, NULL);
> > +	return dentry;
> > +}
> > +
> > +long do_sys_open_by_handle(struct file_handle __user *ufh, int flags)
> > +{
> > +	int fd;
> > +	int retval = 0;
> > +	int d_flags  = flags;
> > +	struct file *filp;
> > +	struct vfsmount *mnt;
> > +	struct inode *inode;
> > +	struct dentry *dentry;
> > +	struct super_block *sb;
> > +	struct file_handle f_handle;
> > +	struct file_handle *handle = NULL;
> > +
> > +	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
> > +		return  -EFAULT;
> > +
> > +	if ((f_handle.handle_size > MAX_HANDLE_SZ) ||
> > +		(f_handle.handle_size <= 0))
> > +		return -EINVAL;
> > +
> > +	if (!capable(CAP_DAC_OVERRIDE))
> > +		return -EPERM;
> > +
> > +	sb = fs_get_sb(&f_handle.fsid);
> > +	if (!sb)
> > +		return -ESTALE;
> > +	/*
> > +	 * Find the vfsmount for this superblock in the
> > +	 * current namespace
> > +	 */
> > +	mnt = fs_get_vfsmount(current, sb);
> > +	if (!mnt) {
> > +		retval = -ESTALE;
> > +		goto out_sb;
> > +	}
> > +
> > +	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_size,
> > +			GFP_KERNEL);
> > +	if (!handle) {
> > +		retval =  -ENOMEM;
> > +		goto out_mnt;
> > +	}
> > +	/* copy the full handle */
> > +	if (copy_from_user(handle, ufh,
> > +				sizeof(struct file_handle) +
> > +				f_handle.handle_size)) {
> > +		retval = -EFAULT;
> > +		goto out_mnt;
> > +	}
> > +
> > +	dentry = handle_to_dentry(mnt, handle);
> > +	if (IS_ERR(dentry)) {
> > +		retval = PTR_ERR(dentry);
> > +		goto out_mnt;
> > +	}
> > +
> > +	inode = dentry->d_inode;
> > +
> > +	flags  = open_to_namei_flags(flags);
> > +	/* O_TRUNC implies we need access checks for write permissions */
> > +	if (flags & O_TRUNC)
> > +		flags |= MAY_WRITE;
> > +
> > +	if ((!(flags & O_APPEND) || (flags & O_TRUNC)) &&
> > +		(flags & FMODE_WRITE) && IS_APPEND(inode)) {
> > +		retval = -EPERM;
> > +		goto out_err;
> > +	}
> > +
> > +	if ((flags & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
> > +		retval = -EACCES;
> > +		goto out_err;
> > +	}
> > +
> > +	/* Can't write directories. */
> > +	if (S_ISDIR(inode->i_mode) && (flags & FMODE_WRITE)) {
> > +		retval = -EISDIR;
> > +		goto out_err;
> > +	}
> > +
> > +	fd = get_unused_fd();
> 
> You want to use alloc_fd() (or get_unused_fd_flags) rather than get_unused_fd
> so that the flags can be passed though and, in particular, so O_CLOEXEC will
> be honoured.

Fixed by saying fd = get_unused_fd_flags(d_flags);

> 
> 
> > +	if (fd < 0) {
> > +		retval = fd;
> > +		goto out_err;
> > +	}
> > +
> > +	filp = dentry_open(dentry, mntget(mnt),
> > +			d_flags, current_cred());
> 
> dentry_open consumed the reference to dentry, so ...
> 
> > +	if (IS_ERR(filp)) {
> > +		put_unused_fd(fd);
> > +		retval =  PTR_ERR(filp);
> > +		goto out_err;
> 
> The dput at 'out_err' will put it one time too many.
> I think the best fix would be to pass dget(dentry) to dentry_open, then ....
> 
> > +	}
> > +
> > +	if (inode->i_mode & S_IFREG) {
> > +		filp->f_flags |= O_NOATIME;
> > +		filp->f_mode |= FMODE_NOCMTIME;
> > +	}
> > +	fsnotify_open(filp->f_path.dentry);
> > +	fd_install(fd, filp);
> > +	retval = fd;
> > +	goto out_mnt;
> 
> ... this goto (Which is rather ugly to me) would not be needed.
> 

Fixed

Thanks for the review
-aneesh
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html