Re: [PATCH 21/38] union-mount: Support for mounting union mount file systems

Ian Kent <raven@xxxxxxxxxx> · Tue, 13 Jul 2010 12:47:02 +0800

On Tue, Jun 15, 2010 at 11:39:51AM -0700, Valerie Aurora wrote:
> Create and tear down union mount structures on mount.  Check
> requirements for union mounts.  This version clones the read-only
> mounts and puts them in an array hanging off the superblock of the
> topmost layer.
> 
> XXX - need array? maybe use mnt_child or mnt_hash instead
> 
> Thanks to Felix Fietkau <nbd@xxxxxxxxxxx> for a bug fix.
> ---
>  fs/namespace.c        |  231 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  fs/super.c            |    1 +
>  include/linux/fs.h    |    3 +
>  include/linux/mount.h |    2 +
>  4 files changed, 235 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 7a399ba..9f3884c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -33,6 +33,7 @@
>  #include <asm/unistd.h>
>  #include "pnode.h"
>  #include "internal.h"
> +#include "union.h"
>  
>  #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
>  #define HASH_SIZE (1UL << HASH_SHIFT)
> @@ -1049,6 +1050,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
>  		propagate_umount(kill);
>  
>  	list_for_each_entry(p, kill, mnt_hash) {
> +		d_free_unions(p->mnt_root);
>  		list_del_init(&p->mnt_expire);
>  		list_del_init(&p->mnt_list);
>  		__touch_mnt_namespace(p->mnt_ns);
> @@ -1334,6 +1336,193 @@ static int invent_group_ids(struct vfsmount *mnt, bool recurse)
>  	return 0;
>  }
>  
> +/**
> + * check_mnt_union - mount-time checks for union mount
> + *
> + * @mntpnt: path of the mountpoint the new mount will be on
> + * @topmost_mnt: vfsmount of the new file system to be mounted
> + * @mnt_flags: mount flags for the new file system
> + *
> + * Mount-time check of upper and lower layer file systems to see if we
> + * can union mount one on the other.
> + *
> + * The rules:
> + *
> + * Lower layer(s) read-only: We can't deal with namespace changes in
> + * the lower layers of a union, so the lower layer must be read-only.
> + * Note that we could possibly convert a read-write unioned mount into
> + * a read-only mount here, which would give us a way to union more
> + * than one layer with separate mount commands.
> + *
> + * Union only at roots of file systems: Only permit unioning of file
> + * systems at their root directories.  This allows us to mark entire
> + * mounts as unioned.  Otherwise we must slowly and expensively work
> + * our way up a path looking for a unioned directory before we know if
> + * a path is from a unioned lower layer.
> + *
> + * No submounts: We could potentially mount over several read-only
> + * submounts; it's just more code to write.
> + *
> + * Topmost layer must be writable to support our readdir()
> + * solution of copying up all lower level entries to the
> + * topmost layer.
> + *
> + * Topmost file system must support whiteouts and fallthrus.
> + *
> + * Topmost file system can't be mounted elsewhere. XXX implement some
> + * kind of marker in the superblock so subsequent mounts are not
> + * possible.
> + *
> + * Note on union mounts and mount event propagation: The lower
> + * layer(s) of a union mount must not have any changes to its
> + * namespace.  Therefore, it must not be part of any mount event
> + * propagation group - i.e., shared or slave.  MNT_SHARED and
> + * MNT_SLAVE are not set at mount, but in do_change_type(), which
> + * prevents setting these flags on file systems with read-only users,
> + * which includes the lower layer(s) of a union mount.
> + */
> +
> +static int
> +check_mnt_union(struct path *mntpnt, struct vfsmount *topmost_mnt, int mnt_flags)
> +{
> +	struct vfsmount *lower_mnt = mntpnt->mnt;
> +
> +	if (!(mnt_flags & MNT_UNION))
> +		return 0;
> +
> +#ifndef CONFIG_UNION_MOUNT
> +	return -EINVAL;
> +#endif
> +	if (!(lower_mnt->mnt_sb->s_flags & MS_RDONLY))
> +		return -EBUSY;
> +
> +	if (!list_empty(&lower_mnt->mnt_mounts))
> +		return -EBUSY;
> +
> +	if (!IS_ROOT(mntpnt->dentry))
> +		return -EINVAL;
> +
> +	if (mnt_flags & MNT_READONLY)
> +		return -EROFS;
> +
> +	if (!(topmost_mnt->mnt_sb->s_flags & MS_WHITEOUT))
> +		return -EINVAL;
> +

Is there a need to check for fallthru support too? Umm ... that probably
doesn't apply for the ROOT(), right?

> +	/* XXX top level mount should only be mounted once */
> +
> +	return 0;
> +}
> +
> +void put_union_sb(struct super_block *sb)
> +{
> +	struct vfsmount *mnt;
> +	int i;
> +
> +	if (sb->s_vfs_union_mnts) {
> +		for (i = 0; i < sb->s_vfs_union_count; i++) {
> +			mnt = sb->s_vfs_union_mnts[i];
> +			if (mnt) {
> +				dec_hard_readonly_users(mnt);
> +				mntput(mnt);
> +			}
> +		}
> +		kfree(sb->s_vfs_union_mnts);
> +	}
> +}
> +
> +static void cleanup_mnt_union(struct vfsmount *topmost_mnt)
> +{
> +	d_free_unions(topmost_mnt->mnt_root);
> +	put_union_sb(topmost_mnt->mnt_sb);
> +}
> +
> +/**
> + * prepare_mnt_union - do setup necessary for a union mount
> + *
> + * @topmost_mnt: vfsmount of topmost layer
> + * @mntpnt: path of requested mountpoint
> + *
> + * A union mount clones the underlying read-only mounts and keeps them
> + * in its own internal list of vfsmounts, hanging off the
> + * superblock.  The first underlying mount (at @mntpnt) has passed
> + * check_mnt_union(), so we know we have at least one layer of union
> + * mount underneath this one.  We union every underlying file system
> + * that is mounted on the same mountpoint (well, pathname) and
> + * read-only.

Last sentence looks a bit odd, would this be better?

We union every underlying file system that is mounted read-only on the
same mountpoint (well, pathname).

> + *
> + * XXX - Maybe should take # of layers to go down as an argument. But
> + * how to pass this in through mount options? All solutions look ugly.
> + */
> +
> +static int prepare_mnt_union(struct vfsmount *topmost_mnt, struct path *mntpnt)
> +{
> +	struct vfsmount *mnt;
> +	struct super_block *sb = topmost_mnt->mnt_sb;
> +	struct union_dir **next_ud;
> +	struct path upper, lower, this_layer;
> +	int i;
> +	int err;
> +
> +	/* Count the mounts to be unioned. */
> +	BUG_ON(sb->s_vfs_union_count != 0);
> +	this_layer = *mntpnt;
> +	while(check_mnt_union(&this_layer, topmost_mnt, MNT_UNION) == 0) {
> +		sb->s_vfs_union_count++;
> +		/* Where is this layer mounted? See if we can union that. */
> +		this_layer.dentry = this_layer.mnt->mnt_mountpoint;
> +		this_layer.mnt = this_layer.mnt->mnt_parent;
> +	}
> +	BUG_ON(sb->s_vfs_union_count == 0);
> +
> +	/*
> +	 * Allocate an array of pointers to vfsmounts.  We use this in
> +	 * deactivate_super() to free the underlying mounts when the
> +	 * topmost layer of a union mount loses its last reference.
> +	 *
> +	 * XXX - can't we link through mnt_child or mnt_hash instead?
> +	 * Neither is in use when a vfsmount is dangling off a union
> +	 * mounted superblock and therefore not part of the vfsmount
> +	 * tree.
> +	 */
> +	err = -ENOMEM;
> +	sb->s_vfs_union_mnts = kzalloc(sb->s_vfs_union_count *
> +				       sizeof (*sb->s_vfs_union_mnts),
> +				       GFP_KERNEL);
> +	if (!sb->s_vfs_union_mnts)
> +		goto out;
> +
> +	/* Clone the mounts */
> +	mnt = mntpnt->mnt;
> +	for (i = 0; i < sb->s_vfs_union_count; i++) {
> +		sb->s_vfs_union_mnts[i] = clone_mnt(mnt, mnt->mnt_root, CL_PRIVATE);
> +		if (!sb->s_vfs_union_mnts[i])
> +			goto out;
> +		inc_hard_readonly_users(mnt);
> +		/* XXX set mountpoint or otherwise manipulate cloned mnt? */
> +		mnt = mnt->mnt_parent;
> +	}
> +
> +	/* Build the union stack for the root dir */
> +	upper.mnt = topmost_mnt;
> +	upper.dentry = topmost_mnt->mnt_root;
> +	next_ud = &topmost_mnt->mnt_root->d_union_dir;
> +	for (i = 0; i < sb->s_vfs_union_count; i++) {
> +		mnt = sb->s_vfs_union_mnts[i];
> +		lower.mnt = mntget(mnt);
> +		lower.dentry = dget(mnt->mnt_root);
> +		err = union_add_dir(&upper, &lower, next_ud);
> +		if (err)
> +			goto out;
> +		next_ud = &lower.dentry->d_union_dir;
> +		upper = lower;
> +	}
> +
> +	return 0;
> +out:
> +	cleanup_mnt_union(topmost_mnt);
> +	return err;
> +}
> +
>  /*
>   *  @source_mnt : mount tree to be attached
>   *  @nd         : place the mount tree @source_mnt is attached
> @@ -1411,9 +1600,16 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
>  		if (err)
>  			goto out;
>  	}
> +
> +	if (!parent_path && IS_MNT_UNION(source_mnt)) {
> +		err = prepare_mnt_union(source_mnt, path);
> +		if (err)
> +			goto out_cleanup_ids;
> +	}
> +
>  	err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
>  	if (err)
> -		goto out_cleanup_ids;
> +		goto out_cleanup_union;
>  
>  	spin_lock(&vfsmount_lock);
>  
> @@ -1437,6 +1633,9 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
>  	spin_unlock(&vfsmount_lock);
>  	return 0;
>  
> + out_cleanup_union:
> +	if (IS_MNT_UNION(source_mnt))
> +		cleanup_mnt_union(source_mnt);
>   out_cleanup_ids:
>  	if (IS_MNT_SHARED(dest_mnt))
>  		cleanup_group_ids(source_mnt, NULL);
> @@ -1490,6 +1689,17 @@ static int do_change_type(struct path *path, int flag)
>  		return -EINVAL;
>  
>  	down_write(&namespace_sem);
> +
> +	/*
> +	 * Mounts of file systems with read-only users can't deal with
> +	 * mount/umount propagation events - it's the moral equivalent
> +	 * of rm -rf dir/ or the like.
> +	 */
> +	if (sb_is_hard_readonly(mnt->mnt_sb)) {
> +		err = -EROFS;
> +		goto out_unlock;
> +	}
> +
>  	if (type == MS_SHARED) {
>  		err = invent_group_ids(mnt, recurse);
>  		if (err)
> @@ -1527,6 +1737,9 @@ static int do_loopback(struct path *path, char *old_name,
>  	err = -EINVAL;
>  	if (IS_MNT_UNBINDABLE(old_path.mnt))
>  		goto out;
> +	/* Mount part of a union mount elsewhere? The mind boggles. */
> +	if (IS_MNT_UNION(old_path.mnt))
> +		goto out;
>  
>  	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
>  		goto out;
> @@ -1548,7 +1761,6 @@ static int do_loopback(struct path *path, char *old_name,
>  		spin_unlock(&vfsmount_lock);
>  		release_mounts(&umount_list);
>  	}
> -
>  out:
>  	up_write(&namespace_sem);
>  	path_put(&old_path);
> @@ -1589,6 +1801,17 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
>  	if (!check_mnt(path->mnt))
>  		return -EINVAL;
>  
> +	if (mnt_flags & MNT_UNION)
> +		return -EINVAL;
> +
> +	if ((path->mnt->mnt_flags & MNT_UNION) &&
> +	    !(mnt_flags & MNT_UNION))
> +		return -EINVAL;
> +
> +	if ((path->mnt->mnt_flags & MNT_UNION) &&
> +	    (mnt_flags & MNT_READONLY))
> +		return -EINVAL;
> +
>  	if (path->dentry != path->mnt->mnt_root)
>  		return -EINVAL;
>  
> @@ -1753,6 +1976,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
>  	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
>  		goto unlock;
>  
> +	err = check_mnt_union(path, newmnt, mnt_flags);
> +	if (err)
> +		goto unlock;
> +
>  	newmnt->mnt_flags = mnt_flags;
>  	if ((err = graft_tree(newmnt, path)))
>  		goto unlock;
> diff --git a/fs/super.c b/fs/super.c
> index 6add39b..2ade113 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -197,6 +197,7 @@ void deactivate_super(struct super_block *s)
>  		down_write(&s->s_umount);
>  		fs->kill_sb(s);
>  		put_filesystem(fs);
> +		put_union_sb(s);
>  		put_super(s);
>  	}
>  }
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 32e6988..cc2934d 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1396,6 +1396,9 @@ struct super_block {
>  	 */
>  	int s_hard_readonly_users;
>  
> +	/* Array of vfsmounts that are part of this union mount */
> +	struct vfsmount **s_vfs_union_mnts;
> +	int s_vfs_union_count;
>  };
>  
>  extern struct timespec current_fs_time(struct super_block *sb);
> diff --git a/include/linux/mount.h b/include/linux/mount.h
> index 0302703..17d3d27 100644
> --- a/include/linux/mount.h
> +++ b/include/linux/mount.h
> @@ -136,4 +136,6 @@ extern void mark_mounts_for_expiry(struct list_head *mounts);
>  
>  extern dev_t name_to_dev_t(char *name);
>  
> +extern void put_union_sb(struct super_block *sb);
> +
>  #endif /* _LINUX_MOUNT_H */
> -- 
> 1.6.3.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html