On Tue, 27 Apr 2010 11:05:45 +1000, Neil Brown <neilb@xxxxxxx> wrote: > On Mon, 26 Apr 2010 23:04:01 +0530 > "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx> wrote: > > > Acked-by: Serge Hallyn <serue@xxxxxxxxxx> > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx> > > --- > > fs/filesystems.c | 32 +++++++++- > > fs/namei.c | 24 ------- > > fs/namespace.c | 38 +++++++++++ > > fs/open.c | 148 +++++++++++++++++++++++++++++++++++++++++ > > fs/pnode.c | 2 +- > > include/linux/mnt_namespace.h | 2 + > > include/linux/namei.h | 24 +++++++ > > 7 files changed, 244 insertions(+), 26 deletions(-) > > > > diff --git a/fs/filesystems.c b/fs/filesystems.c > > index 68ba492..743d36e 100644 > > --- a/fs/filesystems.c > > +++ b/fs/filesystems.c > > @@ -281,5 +281,35 @@ struct file_system_type *get_fs_type(const char *name) > > } > > return fs; > > } > > - > > EXPORT_SYMBOL(get_fs_type); > > + > > +struct super_block *fs_get_sb(struct uuid *fsid) > > +{ > > + struct uuid *this_fsid; > > + struct file_system_type *fs_type; > > + struct super_block *sb, *found_sb = NULL; > > + > > + read_lock(&file_systems_lock); > > + fs_type = file_systems; > > + while (fs_type) { > > You've open-coded a for-loop here... > Why not: > for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) { > ?? Fixed > > > > + spin_lock(&sb_lock); > > + list_for_each_entry(sb, &fs_type->fs_supers, s_instances) { > > + if (!sb->s_op->get_fsid) > > + continue; > > + this_fsid = sb->s_op->get_fsid(sb); > > + if (!memcmp(fsid->uuid, this_fsid->uuid, > > + sizeof(this_fsid->uuid))) { > > + /* found the matching super_block */ > > + atomic_inc(&sb->s_active); > > + found_sb = sb; > > + spin_unlock(&sb_lock); > > + goto out; > > + } > > + } > > + spin_unlock(&sb_lock); > > + fs_type = fs_type->next; > > + } > > +out: > > + read_unlock(&file_systems_lock); > > + return found_sb; > > +} > > diff --git a/fs/namei.c b/fs/namei.c > > index a7dce91..a18711e 100644 > > --- a/fs/namei.c > > +++ b/fs/namei.c > > @@ -1521,30 +1521,6 @@ out_unlock: > > return may_open(&nd->path, 0, open_flag & ~O_TRUNC); > > } > > > > -/* > > - * Note that while the flag value (low two bits) for sys_open means: > > - * 00 - read-only > > - * 01 - write-only > > - * 10 - read-write > > - * 11 - special > > - * it is changed into > > - * 00 - no permissions needed > > - * 01 - read-permission > > - * 10 - write-permission > > - * 11 - read-write > > - * for the internal routines (ie open_namei()/follow_link() etc) > > - * This is more logical, and also allows the 00 "no perm needed" > > - * to be used for symlinks (where the permissions are checked > > - * later). > > - * > > -*/ > > -static inline int open_to_namei_flags(int flag) > > -{ > > - if ((flag+1) & O_ACCMODE) > > - flag++; > > - return flag; > > -} > > - > > static int open_will_truncate(int flag, struct inode *inode) > > { > > /* > > diff --git a/fs/namespace.c b/fs/namespace.c > > index 8174c8a..6168526 100644 > > --- a/fs/namespace.c > > +++ b/fs/namespace.c > > @@ -2364,3 +2364,41 @@ void put_mnt_ns(struct mnt_namespace *ns) > > kfree(ns); > > } > > EXPORT_SYMBOL(put_mnt_ns); > > + > > +/* > > + * Get any vfsmount mapping the superblock in the > > + * task namespace > > + */ > > +struct vfsmount *fs_get_vfsmount(struct task_struct *task, > > + struct super_block *sb) > > +{ > > + struct nsproxy *nsp; > > + struct list_head *mount_list; > > + struct mnt_namespace *ns = NULL; > > + struct vfsmount *mnt, *sb_mnt = NULL; > > + > > + rcu_read_lock(); > > + nsp = task_nsproxy(task); > > + if (nsp) { > > + ns = nsp->mnt_ns; > > + if (ns) > > + get_mnt_ns(ns); > > + } > > + rcu_read_unlock(); > > + if (!ns) > > + return NULL; > > + down_read(&namespace_sem); > > + list_for_each(mount_list, &ns->list) { > > + mnt = list_entry(mount_list, struct vfsmount, mnt_list); > > + if (mnt->mnt_sb == sb) { > > + /* found the matching super block */ > > + sb_mnt = mnt; > > + mntget(sb_mnt); > > + break; > > + } > > + } > > + up_read(&namespace_sem); > > + > > + put_mnt_ns(ns); > > + return sb_mnt; > > +} > > If a task has the same filesystem mounted several times in it's namespace > (via "mount --bind") you just return any arbitrary one (the first). > > Suppose that the filesystem does appear twice, and that in one appearance a > particular directory is mounted on, while in another appearance that > directory is not mounted on. Suppose that the mounted-on appearance is > listed first in ns->list > > Now we use a series of "name_to_handle" / "open_by_handle" calls to descend > through the second appearance to something beneath that directory which (in > the first appearance) is mounted-on. > This will not work as you will actually be descending the first appearance of > the filesystem. > I really think you should leave the choice of filesystem/mountpoint to > user-space. > Will comment on this on a separate thread. > > > diff --git a/fs/open.c b/fs/open.c > > index e9aae5c..b87fa32 100644 > > --- a/fs/open.c > > +++ b/fs/open.c > > @@ -1306,3 +1306,151 @@ SYSCALL_DEFINE2(lname_to_handle, const char __user *, name, > > asmlinkage_protect(2, ret, name, handle); > > return ret; > > } > > + > > +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) > > +{ > > + return 1; > > +} > > + > > +static struct dentry *handle_to_dentry(struct vfsmount *mnt, > > + struct file_handle *handle) > > +{ > > + int handle_size; > > + struct dentry *dentry; > > + > > + /* change the handle size to multiple of sizeof(u32) */ > > + handle_size = handle->handle_size >> 2; > > + dentry = exportfs_decode_fh(mnt, (struct fid *)handle->f_handle, > > + handle_size, handle->handle_type, > > + vfs_dentry_acceptable, NULL); > > + return dentry; > > +} > > + > > +long do_sys_open_by_handle(struct file_handle __user *ufh, int flags) > > +{ > > + int fd; > > + int retval = 0; > > + int d_flags = flags; > > + struct file *filp; > > + struct vfsmount *mnt; > > + struct inode *inode; > > + struct dentry *dentry; > > + struct super_block *sb; > > + struct file_handle f_handle; > > + struct file_handle *handle = NULL; > > + > > + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) > > + return -EFAULT; > > + > > + if ((f_handle.handle_size > MAX_HANDLE_SZ) || > > + (f_handle.handle_size <= 0)) > > + return -EINVAL; > > + > > + if (!capable(CAP_DAC_OVERRIDE)) > > + return -EPERM; > > + > > + sb = fs_get_sb(&f_handle.fsid); > > + if (!sb) > > + return -ESTALE; > > + /* > > + * Find the vfsmount for this superblock in the > > + * current namespace > > + */ > > + mnt = fs_get_vfsmount(current, sb); > > + if (!mnt) { > > + retval = -ESTALE; > > + goto out_sb; > > + } > > + > > + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_size, > > + GFP_KERNEL); > > + if (!handle) { > > + retval = -ENOMEM; > > + goto out_mnt; > > + } > > + /* copy the full handle */ > > + if (copy_from_user(handle, ufh, > > + sizeof(struct file_handle) + > > + f_handle.handle_size)) { > > + retval = -EFAULT; > > + goto out_mnt; > > + } > > + > > + dentry = handle_to_dentry(mnt, handle); > > + if (IS_ERR(dentry)) { > > + retval = PTR_ERR(dentry); > > + goto out_mnt; > > + } > > + > > + inode = dentry->d_inode; > > + > > + flags = open_to_namei_flags(flags); > > + /* O_TRUNC implies we need access checks for write permissions */ > > + if (flags & O_TRUNC) > > + flags |= MAY_WRITE; > > + > > + if ((!(flags & O_APPEND) || (flags & O_TRUNC)) && > > + (flags & FMODE_WRITE) && IS_APPEND(inode)) { > > + retval = -EPERM; > > + goto out_err; > > + } > > + > > + if ((flags & FMODE_WRITE) && IS_IMMUTABLE(inode)) { > > + retval = -EACCES; > > + goto out_err; > > + } > > + > > + /* Can't write directories. */ > > + if (S_ISDIR(inode->i_mode) && (flags & FMODE_WRITE)) { > > + retval = -EISDIR; > > + goto out_err; > > + } > > + > > + fd = get_unused_fd(); > > You want to use alloc_fd() (or get_unused_fd_flags) rather than get_unused_fd > so that the flags can be passed though and, in particular, so O_CLOEXEC will > be honoured. Fixed by saying fd = get_unused_fd_flags(d_flags); > > > > + if (fd < 0) { > > + retval = fd; > > + goto out_err; > > + } > > + > > + filp = dentry_open(dentry, mntget(mnt), > > + d_flags, current_cred()); > > dentry_open consumed the reference to dentry, so ... > > > + if (IS_ERR(filp)) { > > + put_unused_fd(fd); > > + retval = PTR_ERR(filp); > > + goto out_err; > > The dput at 'out_err' will put it one time too many. > I think the best fix would be to pass dget(dentry) to dentry_open, then .... > > > + } > > + > > + if (inode->i_mode & S_IFREG) { > > + filp->f_flags |= O_NOATIME; > > + filp->f_mode |= FMODE_NOCMTIME; > > + } > > + fsnotify_open(filp->f_path.dentry); > > + fd_install(fd, filp); > > + retval = fd; > > + goto out_mnt; > > ... this goto (Which is rather ugly to me) would not be needed. > Fixed Thanks for the review -aneesh -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html