Add an xarray to the mount namespace and use this to provide a mnt_id-to-mount-object mapping for the namespace. Make use of xa_reserve() to perform preallocation before taking the mount_lock. This will allow the set of mount objects in a namespace to be iterated using xarray iteration, without the need to insert and remove fake mounts as bookmarks - which cause issues for other trawlers of the list. As a bonus, if we want to allow it, lseek() can be used to start at a particular mount - though there's no easy way to limit the return to just a single entry, or to enforce a failure if that mount doesn't exist but a later one does. Signed-off-by: David Howells <dhowells@xxxxxxxxxx> cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx> cc: Miklos Szeredi <miklos@xxxxxxxxxx> cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> --- fs/mount.h | 2 + fs/namespace.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 0b6e08cf8afb..455f4d293a65 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -4,6 +4,7 @@ #include <linux/poll.h> #include <linux/ns_common.h> #include <linux/fs_pin.h> +#include <linux/xarray.h> struct mnt_namespace { struct ns_common ns; @@ -14,6 +15,7 @@ struct mnt_namespace { * - taking namespace_sem for read AND taking .ns_lock. */ struct list_head list; + struct xarray mounts_by_id; /* List of mounts by mnt_id */ spinlock_t ns_lock; struct user_namespace *user_ns; struct ucounts *ucounts; diff --git a/fs/namespace.c b/fs/namespace.c index 56bb5a5fdc0d..5c9bcaeac4de 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -901,6 +901,57 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m mnt_add_count(old_parent, -1); } +/* + * Reserve slots in the mnt_id-to-mount mapping in a namespace. This gets the + * memory allocation done upfront. 
+ */ +static int reserve_mnt_id_one(struct mount *mnt, struct mnt_namespace *ns) +{ + struct mount *m; + int ret; + + ret = xa_reserve(&ns->mounts_by_id, mnt->mnt_id, GFP_KERNEL); + if (ret < 0) + return ret; + + list_for_each_entry(m, &mnt->mnt_list, mnt_list) { + ret = xa_reserve(&ns->mounts_by_id, m->mnt_id, GFP_KERNEL); + if (ret < 0) + return ret; + } + + return 0; +} + +static int reserve_mnt_id_list(struct hlist_head *tree_list) +{ + struct mount *child; + int ret; + + hlist_for_each_entry(child, tree_list, mnt_hash) { + ret = reserve_mnt_id_one(child, child->mnt_parent->mnt_ns); + if (ret < 0) + return ret; + } + return 0; +} + +static void add_mnt_to_ns(struct mount *m, struct mnt_namespace *ns) +{ + void *x; + + m->mnt_ns = ns; + x = xa_store(&ns->mounts_by_id, m->mnt_id, m, GFP_ATOMIC); + WARN(xa_err(x), "Couldn't store mnt_id %x\n", m->mnt_id); +} + +static void remove_mnt_from_ns(struct mount *mnt) +{ + if (mnt->mnt_ns && mnt->mnt_ns != MNT_NS_INTERNAL) + xa_erase(&mnt->mnt_ns->mounts_by_id, mnt->mnt_id); + mnt->mnt_ns = NULL; +} + /* * vfsmount lock must be held for write */ @@ -914,8 +965,9 @@ static void commit_tree(struct mount *mnt) BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); - list_for_each_entry(m, &head, mnt_list) - m->mnt_ns = n; + list_for_each_entry(m, &head, mnt_list) { + add_mnt_to_ns(m, n); + } list_splice(&head, n->list.prev); @@ -1529,7 +1581,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) ns->mounts--; __touch_mnt_namespace(ns); } - p->mnt_ns = NULL; + remove_mnt_from_ns(p); if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; @@ -2144,6 +2196,13 @@ static int attach_recursive_mnt(struct mount *source_mnt, err = count_mounts(ns, source_mnt); if (err) goto out; + + /* Reserve id-to-mount mapping slots in the namespace we're + * going to use. 
+ */ + err = reserve_mnt_id_one(source_mnt, dest_mnt->mnt_ns); + if (err) + goto out; } if (IS_MNT_SHARED(dest_mnt)) { @@ -2151,6 +2210,8 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + if (!err && !moving) + err = reserve_mnt_id_list(&tree_list); lock_mount_hash(); if (err) goto out_cleanup_ids; @@ -3260,6 +3321,7 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { + WARN_ON(!xa_empty(&ns->mounts_by_id)); if (!is_anon_ns(ns)) ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); @@ -3306,6 +3368,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); spin_lock_init(&new_ns->ns_lock); + xa_init(&new_ns->mounts_by_id); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; @@ -3362,7 +3425,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, p = old; q = new; while (p) { - q->mnt_ns = new_ns; + add_mnt_to_ns(q, new_ns); new_ns->mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { @@ -3404,7 +3467,7 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name) mntput(m); return ERR_CAST(ns); } - mnt->mnt_ns = ns; + add_mnt_to_ns(mnt, ns); ns->root = mnt; ns->mounts++; list_add(&mnt->mnt_list, &ns->list); @@ -3583,7 +3646,7 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, goto err_path; } mnt = real_mount(newmount.mnt); - mnt->mnt_ns = ns; + add_mnt_to_ns(mnt, ns); ns->root = mnt; ns->mounts = 1; list_add(&mnt->mnt_list, &ns->list); @@ -4193,7 +4256,7 @@ static void __init init_mount_tree(void) if (IS_ERR(ns)) panic("Can't allocate initial namespace"); m = real_mount(mnt); - m->mnt_ns = ns; + add_mnt_to_ns(m, ns); ns->root = m; ns->mounts = 1; list_add(&m->mnt_list, &ns->list); @@ -4270,7 +4333,7 @@ void kern_unmount(struct vfsmount 
*mnt) { /* release long term mount so mount point can be released */ if (!IS_ERR_OR_NULL(mnt)) { - real_mount(mnt)->mnt_ns = NULL; + remove_mnt_from_ns(real_mount(mnt)); synchronize_rcu(); /* yecchhh... */ mntput(mnt); } @@ -4283,7 +4346,7 @@ void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) for (i = 0; i < num; i++) if (mnt[i]) - real_mount(mnt[i])->mnt_ns = NULL; + remove_mnt_from_ns(real_mount(mnt[i])); synchronize_rcu_expedited(); for (i = 0; i < num; i++) mntput(mnt[i]);