From: Bharata B Rao <bharata@xxxxxxxxxxxxxxxxxx> Subject: Mount changes to support union mount. Adds union mount support. This patch adds a new mount type for union mount (MNT_UNION) and changes the mount path to build a union stack during mount. The routines for supporting the creation, traversal and destruction of union stacks are also included here. Signed-off-by: Bharata B Rao <bharata@xxxxxxxxxxxxxxxxxx> --- fs/namespace.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++---- include/linux/fs.h | 1 include/linux/mount.h | 17 +++++ 3 files changed, 172 insertions(+), 10 deletions(-) --- a/fs/namespace.c +++ b/fs/namespace.c @@ -35,6 +35,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLO static int event; static struct list_head *mount_hashtable __read_mostly; +static struct list_head *union_mount_hashtable; static int hash_mask __read_mostly, hash_bits __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static struct rw_semaphore namespace_sem; @@ -54,6 +55,89 @@ static inline unsigned long hash(struct return tmp & hash_mask; } +/* Must be called with vfsmount_lock held */ +static struct union_mount *find_union_mount(struct vfsmount *mnt, + struct dentry *dentry) +{ + struct list_head *head; + struct union_mount *u; + + if (!IS_MNT_UNION(mnt)) + return NULL; + + head = union_mount_hashtable + hash(mnt, dentry); + list_for_each_entry(u, head, hash) + if (u->src_mnt == mnt && u->src_dentry == dentry) + return u; + return NULL; +} + +/* + * When propagating mount events to peer group, this is called under + * vfsmount_lock. Hence using GFP_ATOMIC for kmalloc here. + * TODO: Can we use a separate kmem cache for union_mount ? 
+ */ +struct union_mount *alloc_union_mount(struct vfsmount *src_mnt, + struct dentry *src_dentry, struct vfsmount *dst_mnt, + struct dentry *dst_dentry) +{ + struct union_mount *u; + u = kmalloc(sizeof(struct union_mount), GFP_ATOMIC); + if (!u) + return u; + u->dst_mnt = mntget(dst_mnt); + u->dst_dentry = dget(dst_dentry); + u->src_mnt = src_mnt; + u->src_dentry = dget(src_dentry); + INIT_LIST_HEAD(&u->hash); + INIT_LIST_HEAD(&u->list); + return u; +} + +/* Must be called with vfsmount_lock held */ +void attach_mnt_union(struct union_mount *u) +{ + if (!u) + return; + + list_add_tail(&u->hash, union_mount_hashtable + + hash(u->src_mnt, u->src_dentry)); + list_add_tail(&u->list, &u->src_mnt->mnt_union); +} + +/* + * Finds the next (vfsmount, dentry) in the union stack. If found, returns + * it via @nd and returns true. Else doesn't modify @nd, but returns false. + */ +int next_union_mount(struct nameidata *nd) +{ + struct union_mount *u; + + spin_lock(&vfsmount_lock); + u = find_union_mount(nd->mnt, nd->dentry); + spin_unlock(&vfsmount_lock); + if (u) { + nd->mnt = u->dst_mnt; + nd->dentry = u->dst_dentry; + return 1; + } + return 0; +} + +/* Check if next element of the union stack exists. @nd isn't modified. 
*/ +int next_union_mount_exists(struct vfsmount *mnt, struct dentry *dentry) +{ + struct union_mount *u; + + spin_lock(&vfsmount_lock); + u = find_union_mount(mnt, dentry); + spin_unlock(&vfsmount_lock); + if (u) + return 1; + else + return 0; +} + struct vfsmount *alloc_vfsmnt(const char *name) { struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -67,6 +151,7 @@ struct vfsmount *alloc_vfsmnt(const char INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_LIST_HEAD(&mnt->mnt_union); if (name) { int size = strlen(name) + 1; char *newname = kmalloc(size, GFP_KERNEL); @@ -173,18 +258,20 @@ void mnt_set_mountpoint(struct vfsmount dentry->d_mounted++; } -static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd) +static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd, + struct union_mount *u) { mnt_set_mountpoint(nd->mnt, nd->dentry, mnt); list_add_tail(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry)); list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts); + attach_mnt_union(u); } /* * the caller must hold vfsmount_lock */ -static void commit_tree(struct vfsmount *mnt) +static void commit_tree(struct vfsmount *mnt, struct union_mount *u) { struct vfsmount *parent = mnt->mnt_parent; struct vfsmount *m; @@ -201,6 +288,7 @@ static void commit_tree(struct vfsmount list_add_tail(&mnt->mnt_hash, mount_hashtable + hash(parent, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + attach_mnt_union(u); touch_mnt_namespace(n); } @@ -342,8 +430,18 @@ static struct vfsmount *clone_mnt(struct static inline void __mntput(struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; + struct union_mount *u, *next; + dput(mnt->mnt_root); clear_mnt_user(mnt); + + list_for_each_entry_safe(u, next, &mnt->mnt_union, list) { + list_del_init(&u->list); + dput(u->src_dentry); + mntput(u->dst_mnt); + dput(u->dst_dentry); + kfree(u); + } free_vfsmnt(mnt); 
deactivate_super(sb); } @@ -352,6 +450,17 @@ void mntput_no_expire(struct vfsmount *m { repeat: if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { + struct union_mount *u; + + /* + * Remove all union_mounts under this mnt from the + * union_mount_hashtable. This needs to be done with + * vfsmount_lock held. The rest of the cleanup is done + * outside of the lock. + */ + list_for_each_entry(u, &mnt->mnt_union, list) + list_del_init(&u->hash); + if (likely(!mnt->mnt_pinned)) { spin_unlock(&vfsmount_lock); __mntput(mnt); @@ -436,6 +545,7 @@ static int show_vfsmnt(struct seq_file * { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, { MNT_NOMNT, ",nomnt" }, + { MNT_UNION, ",union" }, { 0, NULL } }; struct proc_fs_info *fs_infop; @@ -839,7 +949,11 @@ struct vfsmount *copy_tree(struct vfsmou goto error; spin_lock(&vfsmount_lock); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, &nd); + /* + * TODO: Understand and pass appropriate union_mount + * argument here. 
+ */ + attach_mnt(q, &nd, NULL); spin_unlock(&vfsmount_lock); } } @@ -925,10 +1039,16 @@ static int attach_recursive_mnt(struct v struct vfsmount *dest_mnt = nd->mnt; struct dentry *dest_dentry = nd->dentry; struct vfsmount *child, *p; + struct union_mount *u = NULL; if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list)) return -EINVAL; + if (IS_MNT_UNION(source_mnt)) + if (!(u = alloc_union_mount(source_mnt, source_mnt->mnt_root, + dest_mnt, dest_dentry))) + return -ENOMEM; + if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); @@ -937,18 +1057,26 @@ static int attach_recursive_mnt(struct v spin_lock(&vfsmount_lock); if (parent_nd) { detach_mnt(source_mnt, parent_nd); - attach_mnt(source_mnt, nd); + attach_mnt(source_mnt, nd, u); touch_mnt_namespace(current->nsproxy->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); - commit_tree(source_mnt); + commit_tree(source_mnt, u); } list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { list_del_init(&child->mnt_hash); - commit_tree(child); + if (IS_MNT_UNION(child)) { + u = alloc_union_mount(child, child->mnt_root, + child->mnt_parent, child->mnt_mountpoint); + /* FIXME: It is too late to fail from here */ + if (!u) + printk(KERN_ERR "attach_recursive_mnt: ENOMEM\n"); + } + commit_tree(child, u); } spin_unlock(&vfsmount_lock); + return 0; } @@ -1556,9 +1684,12 @@ long do_mount(char *dev_name, char *dir_ mnt_flags |= MNT_RELATIME; if (flags & MS_NOMNT) mnt_flags |= MNT_NOMNT; + if (flags & MS_UNION) + mnt_flags |= MNT_UNION; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_NOMNT); + MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_NOMNT | + MS_UNION); /* ... 
and get the mountpoint */ retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); @@ -1888,8 +2019,9 @@ asmlinkage long sys_pivot_root(const cha goto out3; detach_mnt(new_nd.mnt, &parent_nd); detach_mnt(user_nd.mnt, &root_parent); - attach_mnt(user_nd.mnt, &old_nd); /* mount old root on put_old */ - attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */ + /* TODO: Understand and pass appropriate union_mount argument here. */ + attach_mnt(user_nd.mnt, &old_nd, NULL); /* mount old root on put_old */ + attach_mnt(new_nd.mnt, &root_parent, NULL); /* mount new_root on / */ touch_mnt_namespace(current->nsproxy->mnt_ns); spin_unlock(&vfsmount_lock); chroot_fs_refs(&user_nd, &new_nd); @@ -1940,7 +2072,7 @@ static void __init init_mount_tree(void) void __init mnt_init(unsigned long mempages) { - struct list_head *d; + struct list_head *d, *e; unsigned int nr_hash; int i; int err; @@ -1976,12 +2108,24 @@ void __init mnt_init(unsigned long mempa printk("Mount-cache hash table entries: %d\n", nr_hash); + /* + * Use the same nr_hash for union mount hashtable also. + * TODO: This might need a bigger hash table. + */ + union_mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + + if (!union_mount_hashtable) + panic("Failed to allocate union mount hash table\n"); + /* And initialize the newly allocated array */ d = mount_hashtable; + e = union_mount_hashtable; i = nr_hash; do { INIT_LIST_HEAD(d); + INIT_LIST_HEAD(e); d++; + e++; i--; } while (i); err = sysfs_init(); --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -113,6 +113,7 @@ extern int dir_notify_enable; #define MS_REMOUNT 32 /* Alter flags of a mounted FS */ #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ #define MS_DIRSYNC 128 /* Directory modifications are synchronous */ +#define MS_UNION 256 /* Union mount */ #define MS_NOATIME 1024 /* Do not update access times. 
*/ #define MS_NODIRATIME 2048 /* Do not update directory access times */ #define MS_BIND 4096 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -36,6 +36,7 @@ struct mnt_namespace; #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ #define MNT_PNODE_MASK 0x3000 /* propagation flag mask */ +#define MNT_UNION 0x4000 /* if the vfsmount is a union mount */ struct vfsmount { struct list_head mnt_hash; @@ -53,6 +54,7 @@ struct vfsmount { struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ struct list_head mnt_slave; /* slave list entry */ + struct list_head mnt_union; /* list of union_mounts */ struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ /* @@ -107,5 +109,20 @@ extern void shrink_submounts(struct vfsm extern spinlock_t vfsmount_lock; extern dev_t name_to_dev_t(char *name); +#define IS_MNT_UNION(mnt) (mnt->mnt_flags & MNT_UNION) + +struct union_mount { + struct vfsmount *src_mnt, *dst_mnt; + struct dentry *src_dentry, *dst_dentry; + struct list_head hash, list; +}; + +extern void attach_mnt_union(struct union_mount *u); +extern struct union_mount *alloc_union_mount(struct vfsmount *src_mnt, + struct dentry *src_dentry, struct vfsmount *dst_mnt, + struct dentry *dst_dentry); +extern int next_union_mount(struct nameidata *nd); +extern int next_union_mount_exists(struct vfsmount *mnt, struct dentry *dentry); + #endif #endif /* _LINUX_MOUNT_H */ - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html