On Mon, 26 Apr 2010 23:04:01 +0530 "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx> wrote: > Acked-by: Serge Hallyn <serue@xxxxxxxxxx> > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx> > --- > fs/filesystems.c | 32 +++++++++- > fs/namei.c | 24 ------- > fs/namespace.c | 38 +++++++++++ > fs/open.c | 148 +++++++++++++++++++++++++++++++++++++++++ > fs/pnode.c | 2 +- > include/linux/mnt_namespace.h | 2 + > include/linux/namei.h | 24 +++++++ > 7 files changed, 244 insertions(+), 26 deletions(-) > > diff --git a/fs/filesystems.c b/fs/filesystems.c > index 68ba492..743d36e 100644 > --- a/fs/filesystems.c > +++ b/fs/filesystems.c > @@ -281,5 +281,35 @@ struct file_system_type *get_fs_type(const char *name) > } > return fs; > } > - > EXPORT_SYMBOL(get_fs_type); > + > +struct super_block *fs_get_sb(struct uuid *fsid) > +{ > + struct uuid *this_fsid; > + struct file_system_type *fs_type; > + struct super_block *sb, *found_sb = NULL; > + > + read_lock(&file_systems_lock); > + fs_type = file_systems; > + while (fs_type) { You've open-coded a for-loop here... Why not: for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) { ?? > + spin_lock(&sb_lock); > + list_for_each_entry(sb, &fs_type->fs_supers, s_instances) { > + if (!sb->s_op->get_fsid) > + continue; > + this_fsid = sb->s_op->get_fsid(sb); > + if (!memcmp(fsid->uuid, this_fsid->uuid, > + sizeof(this_fsid->uuid))) { > + /* found the matching super_block */ > + atomic_inc(&sb->s_active); > + found_sb = sb; > + spin_unlock(&sb_lock); > + goto out; > + } > + } > + spin_unlock(&sb_lock); > + fs_type = fs_type->next; > + } > +out: > + read_unlock(&file_systems_lock); > + return found_sb; > +} > diff --git a/fs/namei.c b/fs/namei.c > index a7dce91..a18711e 100644 > --- a/fs/namei.c > +++ b/fs/namei.c > @@ -1521,30 +1521,6 @@ out_unlock: > return may_open(&nd->path, 0, open_flag & ~O_TRUNC); > } > > -/* > - * Note that while the flag value (low two bits) for sys_open means: > - * 00 - read-only > - * 01 - write-only > - * 10 - read-write > - * 11 - special > - * it is changed into > - * 00 - no permissions needed > - * 01 - read-permission > - * 10 - write-permission > - * 11 - read-write > - * for the internal routines (ie open_namei()/follow_link() etc) > - * This is more logical, and also allows the 00 "no perm needed" > - * to be used for symlinks (where the permissions are checked > - * later). > - * > -*/ > -static inline int open_to_namei_flags(int flag) > -{ > - if ((flag+1) & O_ACCMODE) > - flag++; > - return flag; > -} > - > static int open_will_truncate(int flag, struct inode *inode) > { > /* > diff --git a/fs/namespace.c b/fs/namespace.c > index 8174c8a..6168526 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -2364,3 +2364,41 @@ void put_mnt_ns(struct mnt_namespace *ns) > kfree(ns); > } > EXPORT_SYMBOL(put_mnt_ns); > + > +/* > + * Get any vfsmount mapping the superblock in the > + * task namespace > + */ > +struct vfsmount *fs_get_vfsmount(struct task_struct *task, > + struct super_block *sb) > +{ > + struct nsproxy *nsp; > + struct list_head *mount_list; > + struct mnt_namespace *ns = NULL; > + struct vfsmount *mnt, *sb_mnt = NULL; > + > + rcu_read_lock(); > + nsp = task_nsproxy(task); > + if (nsp) { > + ns = nsp->mnt_ns; > + if (ns) > + get_mnt_ns(ns); > + } > + rcu_read_unlock(); > + if (!ns) > + return NULL; > + down_read(&namespace_sem); > + list_for_each(mount_list, &ns->list) { > + mnt = list_entry(mount_list, struct vfsmount, mnt_list); > + if (mnt->mnt_sb == sb) { > + /* found the matching super block */ > + sb_mnt = mnt; > + mntget(sb_mnt); > + break; > + } > + } > + up_read(&namespace_sem); > + > + put_mnt_ns(ns); > + return sb_mnt; > +} If a task has the same filesystem mounted several times in it's namespace (via "mount --bind") you just return any arbitrary one (the first). Suppose that the filesystem does appear twice, and that in one appearance a particular directory is mounted on, while in another appearance that directory is not mounted on. Suppose that the mounted-on appearance is listed first in ns->list Now we use a series of "name_to_handle" / "open_by_handle" calls to descend through the second appearance to something beneath that directory which (in the first appearance) is mounted-on. This will not work as you will actually be descending the first appearance of the filesystem. I really think you should leave the choice of filesystem/mountpoint to user-space. > diff --git a/fs/open.c b/fs/open.c > index e9aae5c..b87fa32 100644 > --- a/fs/open.c > +++ b/fs/open.c > @@ -1306,3 +1306,151 @@ SYSCALL_DEFINE2(lname_to_handle, const char __user *, name, > asmlinkage_protect(2, ret, name, handle); > return ret; > } > + > +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) > +{ > + return 1; > +} > + > +static struct dentry *handle_to_dentry(struct vfsmount *mnt, > + struct file_handle *handle) > +{ > + int handle_size; > + struct dentry *dentry; > + > + /* change the handle size to multiple of sizeof(u32) */ > + handle_size = handle->handle_size >> 2; > + dentry = exportfs_decode_fh(mnt, (struct fid *)handle->f_handle, > + handle_size, handle->handle_type, > + vfs_dentry_acceptable, NULL); > + return dentry; > +} > + > +long do_sys_open_by_handle(struct file_handle __user *ufh, int flags) > +{ > + int fd; > + int retval = 0; > + int d_flags = flags; > + struct file *filp; > + struct vfsmount *mnt; > + struct inode *inode; > + struct dentry *dentry; > + struct super_block *sb; > + struct file_handle f_handle; > + struct file_handle *handle = NULL; > + > + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) > + return -EFAULT; > + > + if ((f_handle.handle_size > MAX_HANDLE_SZ) || > + (f_handle.handle_size <= 0)) > + return -EINVAL; > + > + if (!capable(CAP_DAC_OVERRIDE)) > + return -EPERM; > + > + sb = fs_get_sb(&f_handle.fsid); > + if (!sb) > + return -ESTALE; > + /* > + * Find the vfsmount for this superblock in the > + * current namespace > + */ > + mnt = fs_get_vfsmount(current, sb); > + if (!mnt) { > + retval = -ESTALE; > + goto out_sb; > + } > + > + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_size, > + GFP_KERNEL); > + if (!handle) { > + retval = -ENOMEM; > + goto out_mnt; > + } > + /* copy the full handle */ > + if (copy_from_user(handle, ufh, > + sizeof(struct file_handle) + > + f_handle.handle_size)) { > + retval = -EFAULT; > + goto out_mnt; > + } > + > + dentry = handle_to_dentry(mnt, handle); > + if (IS_ERR(dentry)) { > + retval = PTR_ERR(dentry); > + goto out_mnt; > + } > + > + inode = dentry->d_inode; > + > + flags = open_to_namei_flags(flags); > + /* O_TRUNC implies we need access checks for write permissions */ > + if (flags & O_TRUNC) > + flags |= MAY_WRITE; > + > + if ((!(flags & O_APPEND) || (flags & O_TRUNC)) && > + (flags & FMODE_WRITE) && IS_APPEND(inode)) { > + retval = -EPERM; > + goto out_err; > + } > + > + if ((flags & FMODE_WRITE) && IS_IMMUTABLE(inode)) { > + retval = -EACCES; > + goto out_err; > + } > + > + /* Can't write directories. */ > + if (S_ISDIR(inode->i_mode) && (flags & FMODE_WRITE)) { > + retval = -EISDIR; > + goto out_err; > + } > + > + fd = get_unused_fd(); You want to use alloc_fd() (or get_unused_fd_flags) rather than get_unused_fd so that the flags can be passed though and, in particular, so O_CLOEXEC will be honoured. > + if (fd < 0) { > + retval = fd; > + goto out_err; > + } > + > + filp = dentry_open(dentry, mntget(mnt), > + d_flags, current_cred()); dentry_open consumed the reference to dentry, so ... > + if (IS_ERR(filp)) { > + put_unused_fd(fd); > + retval = PTR_ERR(filp); > + goto out_err; The dput at 'out_err' will put it one time too many. I think the best fix would be to pass dget(dentry) to dentry_open, then .... > + } > + > + if (inode->i_mode & S_IFREG) { > + filp->f_flags |= O_NOATIME; > + filp->f_mode |= FMODE_NOCMTIME; > + } > + fsnotify_open(filp->f_path.dentry); > + fd_install(fd, filp); > + retval = fd; > + goto out_mnt; ... this goto (Which is rather ugly to me) would not be needed. NeilBrown > + > +out_err: > + dput(dentry); > +out_mnt: > + kfree(handle); > + mntput(mnt); > +out_sb: > + deactivate_super(sb); > + > + return retval; > +} > + > +SYSCALL_DEFINE2(open_by_handle, struct file_handle __user *, handle, > + int, flags) > +{ > + long ret; > + > + if (force_o_largefile()) > + flags |= O_LARGEFILE; > + > + ret = do_sys_open_by_handle(handle, flags); > + > + /* avoid REGPARM breakage on x86: */ > + asmlinkage_protect(2, ret, handle, flags); > + return ret; > +} > diff --git a/fs/pnode.c b/fs/pnode.c > index 5cc564a..9f6d12d 100644 > --- a/fs/pnode.c > +++ b/fs/pnode.c > @@ -6,9 +6,9 @@ > * Author : Ram Pai (linuxram@xxxxxxxxxx) > * > */ > +#include <linux/fs.h> > #include <linux/mnt_namespace.h> > #include <linux/mount.h> > -#include <linux/fs.h> > #include "internal.h" > #include "pnode.h" > > diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h > index 0b89efc..d363ecc 100644 > --- a/include/linux/mnt_namespace.h > +++ b/include/linux/mnt_namespace.h > @@ -36,6 +36,8 @@ extern const struct seq_operations mounts_op; > extern const struct seq_operations mountinfo_op; > extern const struct seq_operations mountstats_op; > extern int mnt_had_events(struct proc_mounts *); > +extern struct vfsmount *fs_get_vfsmount(struct task_struct *task, > + struct super_block *sb); > > #endif > #endif > diff --git a/include/linux/namei.h b/include/linux/namei.h > index 05b441d..a853aa0 100644 > --- a/include/linux/namei.h > +++ b/include/linux/namei.h > @@ -4,6 +4,7 @@ > #include <linux/dcache.h> > #include <linux/linkage.h> > #include <linux/path.h> > +#include <asm-generic/fcntl.h> > > struct vfsmount; > > @@ -96,4 +97,27 @@ static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) > ((char *) name)[min(len, maxlen)] = '\0'; > } > > +/* > + * Note that while the flag value (low two bits) for sys_open means: > + * 00 - read-only > + * 01 - write-only > + * 10 - read-write > + * 11 - special > + * it is changed into > + * 00 - no permissions needed > + * 01 - read-permission > + * 10 - write-permission > + * 11 - read-write > + * for the internal routines (ie open_namei()/follow_link() etc) > + * This is more logical, and also allows the 00 "no perm needed" > + * to be used for symlinks (where the permissions are checked > + * later). > + * > +*/ > +static inline int open_to_namei_flags(int flag) > +{ > + if ((flag+1) & O_ACCMODE) > + flag++; > + return flag; > +} > #endif /* _LINUX_NAMEI_H */ -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html