On Tue, Jun 15, 2010 at 11:39:51AM -0700, Valerie Aurora wrote: > Create and tear down union mount structures on mount. Check > requirements for union mounts. This version clones the read-only > mounts and puts them in an array hanging off the superblock of the > topmost layer. > > XXX - need array? maybe use mnt_child or mnt_hash instead > > Thanks to Felix Fietkau <nbd@xxxxxxxxxxx> for a bug fix. > --- > fs/namespace.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++++- > fs/super.c | 1 + > include/linux/fs.h | 3 + > include/linux/mount.h | 2 + > 4 files changed, 235 insertions(+), 2 deletions(-) > > diff --git a/fs/namespace.c b/fs/namespace.c > index 7a399ba..9f3884c 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -33,6 +33,7 @@ > #include <asm/unistd.h> > #include "pnode.h" > #include "internal.h" > +#include "union.h" > > #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) > #define HASH_SIZE (1UL << HASH_SHIFT) > @@ -1049,6 +1050,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) > propagate_umount(kill); > > list_for_each_entry(p, kill, mnt_hash) { > + d_free_unions(p->mnt_root); > list_del_init(&p->mnt_expire); > list_del_init(&p->mnt_list); > __touch_mnt_namespace(p->mnt_ns); > @@ -1334,6 +1336,193 @@ static int invent_group_ids(struct vfsmount *mnt, bool recurse) > return 0; > } > > +/** > + * check_mnt_union - mount-time checks for union mount > + * > + * @mntpnt: path of the mountpoint the new mount will be on > + * @topmost_mnt: vfsmount of the new file system to be mounted > + * @mnt_flags: mount flags for the new file system > + * > + * Mount-time check of upper and lower layer file systems to see if we > + * can union mount one on the other. > + * > + * The rules: > + * > + * Lower layer(s) read-only: We can't deal with namespace changes in > + * the lower layers of a union, so the lower layer must be read-only. 
> + * Note that we could possibly convert a read-write unioned mount into > + * a read-only mount here, which would give us a way to union more > + * than one layer with separate mount commands. > + * > + * Union only at roots of file systems: Only permit unioning of file > + * systems at their root directories. This allows us to mark entire > + * mounts as unioned. Otherwise we must slowly and expensively work > + * our way up a path looking for a unioned directory before we know if > + * a path is from a unioned lower layer. > + * > + * No submounts. We could potentially mount over several read-only > + * submounts, it's just more code to write. > + * > + * Topmost layer must be writable to support our readdir() > + * solution of copying up all lower level entries to the > + * topmost layer. > + * > + * Topmost file system must support whiteouts and fallthrus. > + * > + * Topmost file system can't be mounted elsewhere. XXX implement some > + * kind of marker in the superblock so subsequent mounts are not > + * possible. > + * > + * Note on union mounts and mount event propagation: The lower > + * layer(s) of a union mount must not have any changes to its > + * namespace. Therefore, it must not be part of any mount event > + * propagation group - i.e., shared or slave. MNT_SHARED and > + * MNT_SLAVE are not set at mount, but in do_change_type(), which > + * prevents setting these flags on file systems with read-only users, > + * which includes the lower layer(s) of a union mount. 
> + */ > + > +static int > +check_mnt_union(struct path *mntpnt, struct vfsmount *topmost_mnt, int mnt_flags) > +{ > + struct vfsmount *lower_mnt = mntpnt->mnt; > + > + if (!(mnt_flags & MNT_UNION)) > + return 0; > + > +#ifndef CONFIG_UNION_MOUNT > + return -EINVAL; > +#endif > + if (!(lower_mnt->mnt_sb->s_flags & MS_RDONLY)) > + return -EBUSY; > + > + if (!list_empty(&lower_mnt->mnt_mounts)) > + return -EBUSY; > + > + if (!IS_ROOT(mntpnt->dentry)) > + return -EINVAL; > + > + if (mnt_flags & MNT_READONLY) > + return -EROFS; > + > + if (!(topmost_mnt->mnt_sb->s_flags & MS_WHITEOUT)) > + return -EINVAL; > + Is there a need to check for fallthru support as well? Umm ... that probably doesn't apply to the ROOT() case, right? > + /* XXX top level mount should only be mounted once */ > + > + return 0; > +} > + > +void put_union_sb(struct super_block *sb) > +{ > + struct vfsmount *mnt; > + int i; > + > + if (sb->s_vfs_union_mnts) { > + for (i = 0; i < sb->s_vfs_union_count; i++) { > + mnt = sb->s_vfs_union_mnts[i]; > + if (mnt) { > + dec_hard_readonly_users(mnt); > + mntput(mnt); > + } > + } > + kfree(sb->s_vfs_union_mnts); > + } > +} > + > +static void cleanup_mnt_union(struct vfsmount *topmost_mnt) > +{ > + d_free_unions(topmost_mnt->mnt_root); > + put_union_sb(topmost_mnt->mnt_sb); > +} > + > +/** > + * prepare_mnt_union - do setup necessary for a union mount > + * > + * @topmost_mnt: vfsmount of topmost layer > + * @mntpnt: path of requested mountpoint > + * > + * A union mount clones the underlying read-only mounts and keeps them > + * in its own internal list of vfsmounts, hanging off the > + * superblock. The first underlying mount (at @mntpnt) has passed > + * check_mnt_union(), so we know we have at least one layer of union > + * mount underneath this one. We union every underlying file system > + * that is mounted on the same mountpoint (well, pathname) and > + * read-only. The last sentence looks a bit odd; would this be better? 
We union every underlying file system that is mounted read-only on the same mountpoint (well, pathname). > + * > + * XXX - Maybe should take # of layers to go down as an argument. But > + * how to pass this in through mount options? All solutions look ugly. > + */ > + > +static int prepare_mnt_union(struct vfsmount *topmost_mnt, struct path *mntpnt) > +{ > + struct vfsmount *mnt; > + struct super_block *sb = topmost_mnt->mnt_sb; > + struct union_dir **next_ud; > + struct path upper, lower, this_layer; > + int i; > + int err; > + > + /* Count the mounts to be unioned. */ > + BUG_ON(sb->s_vfs_union_count != 0); > + this_layer = *mntpnt; > + while(check_mnt_union(&this_layer, topmost_mnt, MNT_UNION) == 0) { > + sb->s_vfs_union_count++; > + /* Where is this layer mounted? See if we can union that. */ > + this_layer.dentry = this_layer.mnt->mnt_mountpoint; > + this_layer.mnt = this_layer.mnt->mnt_parent; > + } > + BUG_ON(sb->s_vfs_union_count == 0); > + > + /* > + * Allocate an array of pointers to vfsmounts. We use this in > + * deactivate_super() to free the underlying mounts when the > + * topmost layer of a union mount loses its last reference. > + * > + * XXX - can't we link through mnt_child or mnt_hash instead? > + * Neither is in use when a vfsmount is dangling off a union > + * mounted superblock and therefore not part of the vfsmount > + * tree. > + */ > + err = -ENOMEM; > + sb->s_vfs_union_mnts = kzalloc(sb->s_vfs_union_count * > + sizeof (*sb->s_vfs_union_mnts), > + GFP_KERNEL); > + if (!sb->s_vfs_union_mnts) > + goto out; > + > + /* Clone the mounts */ > + mnt = mntpnt->mnt; > + for (i = 0; i < sb->s_vfs_union_count; i++) { > + sb->s_vfs_union_mnts[i] = clone_mnt(mnt, mnt->mnt_root, CL_PRIVATE); > + if (!sb->s_vfs_union_mnts[i]) > + goto out; > + inc_hard_readonly_users(mnt); > + /* XXX set mountpoint or otherwise manipulate cloned mnt? 
*/ > + mnt = mnt->mnt_parent; > + } > + > + /* Build the union stack for the root dir */ > + upper.mnt = topmost_mnt; > + upper.dentry = topmost_mnt->mnt_root; > + next_ud = &topmost_mnt->mnt_root->d_union_dir; > + for (i = 0; i < sb->s_vfs_union_count; i++) { > + mnt = sb->s_vfs_union_mnts[i]; > + lower.mnt = mntget(mnt); > + lower.dentry = dget(mnt->mnt_root); > + err = union_add_dir(&upper, &lower, next_ud); > + if (err) > + goto out; > + next_ud = &lower.dentry->d_union_dir; > + upper = lower; > + } > + > + return 0; > +out: > + cleanup_mnt_union(topmost_mnt); > + return err; > +} > + > /* > * @source_mnt : mount tree to be attached > * @nd : place the mount tree @source_mnt is attached > @@ -1411,9 +1600,16 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, > if (err) > goto out; > } > + > + if (!parent_path && IS_MNT_UNION(source_mnt)) { > + err = prepare_mnt_union(source_mnt, path); > + if (err) > + goto out_cleanup_ids; > + } > + > err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); > if (err) > - goto out_cleanup_ids; > + goto out_cleanup_union; > > spin_lock(&vfsmount_lock); > > @@ -1437,6 +1633,9 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, > spin_unlock(&vfsmount_lock); > return 0; > > + out_cleanup_union: > + if (IS_MNT_UNION(source_mnt)) > + cleanup_mnt_union(source_mnt); > out_cleanup_ids: > if (IS_MNT_SHARED(dest_mnt)) > cleanup_group_ids(source_mnt, NULL); > @@ -1490,6 +1689,17 @@ static int do_change_type(struct path *path, int flag) > return -EINVAL; > > down_write(&namespace_sem); > + > + /* > + * Mounts of file systems with read-only users can't deal with > + * mount/umount propagation events - it's the moral equivalent > + * of rm -rf dir/ or the like. 
> + */ > + if (sb_is_hard_readonly(mnt->mnt_sb)) { > + err = -EROFS; > + goto out_unlock; > + } > + > if (type == MS_SHARED) { > err = invent_group_ids(mnt, recurse); > if (err) > @@ -1527,6 +1737,9 @@ static int do_loopback(struct path *path, char *old_name, > err = -EINVAL; > if (IS_MNT_UNBINDABLE(old_path.mnt)) > goto out; > + /* Mount part of a union mount elsewhere? The mind boggles. */ > + if (IS_MNT_UNION(old_path.mnt)) > + goto out; > > if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) > goto out; > @@ -1548,7 +1761,6 @@ static int do_loopback(struct path *path, char *old_name, > spin_unlock(&vfsmount_lock); > release_mounts(&umount_list); > } > - > out: > up_write(&namespace_sem); > path_put(&old_path); > @@ -1589,6 +1801,17 @@ static int do_remount(struct path *path, int flags, int mnt_flags, > if (!check_mnt(path->mnt)) > return -EINVAL; > > + if (mnt_flags & MNT_UNION) > + return -EINVAL; > + > + if ((path->mnt->mnt_flags & MNT_UNION) && > + !(mnt_flags & MNT_UNION)) > + return -EINVAL; > + > + if ((path->mnt->mnt_flags & MNT_UNION) && > + (mnt_flags & MNT_READONLY)) > + return -EINVAL; > + > if (path->dentry != path->mnt->mnt_root) > return -EINVAL; > > @@ -1753,6 +1976,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, > if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) > goto unlock; > > + err = check_mnt_union(path, newmnt, mnt_flags); > + if (err) > + goto unlock; > + > newmnt->mnt_flags = mnt_flags; > if ((err = graft_tree(newmnt, path))) > goto unlock; > diff --git a/fs/super.c b/fs/super.c > index 6add39b..2ade113 100644 > --- a/fs/super.c > +++ b/fs/super.c > @@ -197,6 +197,7 @@ void deactivate_super(struct super_block *s) > down_write(&s->s_umount); > fs->kill_sb(s); > put_filesystem(fs); > + put_union_sb(s); > put_super(s); > } > } > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 32e6988..cc2934d 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1396,6 +1396,9 @@ struct super_block { > */ > 
int s_hard_readonly_users; > > + /* Array of vfsmounts that are part of this union mount */ > + struct vfsmount **s_vfs_union_mnts; > + int s_vfs_union_count; > }; > > extern struct timespec current_fs_time(struct super_block *sb); > diff --git a/include/linux/mount.h b/include/linux/mount.h > index 0302703..17d3d27 100644 > --- a/include/linux/mount.h > +++ b/include/linux/mount.h > @@ -136,4 +136,6 @@ extern void mark_mounts_for_expiry(struct list_head *mounts); > > extern dev_t name_to_dev_t(char *name); > > +extern void put_union_sb(struct super_block *sb); > + > #endif /* _LINUX_MOUNT_H */ > -- > 1.6.3.3 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html