Hi Eric, I found one issue about this patch. Take a look at the next script. The idea is simple, we call mount twice and than we call umount twice, but the second umount fails. [root@fc24 ~]# cat m.sh #!/bin/sh set -x -e mount -t tmpfs xxx /mnt mkdir -p /mnt/1 mkdir -p /mnt/2 mount --bind /mnt/1 /mnt/1 mount --make-shared /mnt/1 mount --bind /mnt/1 /mnt/2 mkdir -p /mnt/1/1 for i in `seq 2`; do mount --bind /mnt/1/1 /mnt/1/1 done for i in `seq 2`; do umount /mnt/1/1 || { cat /proc/self/mountinfo | grep xxx exit 1 } done [root@fc24 ~]# unshare -Urm ./m.sh + mount -t tmpfs xxx /mnt + mkdir -p /mnt/1 + mkdir -p /mnt/2 + mount --bind /mnt/1 /mnt/1 + mount --make-shared /mnt/1 + mount --bind /mnt/1 /mnt/2 + mkdir -p /mnt/1/1 ++ seq 2 + for i in '`seq 2`' + mount --bind /mnt/1/1 /mnt/1/1 + for i in '`seq 2`' + mount --bind /mnt/1/1 /mnt/1/1 ++ seq 2 + for i in '`seq 2`' + umount /mnt/1/1 + for i in '`seq 2`' + umount /mnt/1/1 umount: /mnt/1/1: not mounted + cat /proc/self/mountinfo + grep xxx 147 116 0:42 / /mnt rw,relatime - tmpfs xxx rw 148 147 0:42 /1 /mnt/1 rw,relatime shared:65 - tmpfs xxx rw 149 147 0:42 /1 /mnt/2 rw,relatime shared:65 - tmpfs xxx rw 157 149 0:42 /1/1 /mnt/2/1 rw,relatime shared:65 - tmpfs xxx rw + exit 1 And you can see that /mnt/2 contains a mount, but it should not. Thanks, Andrei On Thu, Jan 05, 2017 at 10:04:14AM +1300, Eric W. Biederman wrote: > > Ever since mount propagation was introduced in cases where a mount in > propagated to parent mount mountpoint pair that is already in use the > code has placed the new mount behind the old mount in the mount hash > table. > > This implementation detail is problematic as it allows creating > arbitrary length mount hash chains. > > Furthermore it invalidates the constraint maintained elsewhere in the > mount code that a parent mount and a mountpoint pair will have exactly > one mount upon them. Making it hard to deal with and to talk about > this special case in the mount code. > > Modify mount propagation to notice when there is already a mount at > the parent mount and mountpoint where a new mount is propagating to > and place that preexisting mount on top of the new mount. > > Modify unmount propagation to notice when a mount that is being > unmounted has another mount on top of it (and no other children), and > to replace the unmounted mount with the mount on top of it. > > Move the MNT_UMUONT test from __lookup_mnt_last into > __propagate_umount as that is the only call of __lookup_mnt_last where > MNT_UMOUNT may be set on any mount visible in the mount hash table. > > These modifications allow: > - __lookup_mnt_last to be removed. > - attach_shadows to be renamed __attach_mnt and the it's shadow > handling to be removed. > - commit_tree to be simplified > - copy_tree to be simplified > > The result is an easier to understand tree of mounts that does not > allow creation of arbitrary length hash chains in the mount hash table. > > v2: Updated to mnt_change_mountpoint to not call dput or mntput > and instead to decrement the counts directly. It is guaranteed > that there will be other references when mnt_change_mountpoint is > called so this is safe. > > Cc: stable@xxxxxxxxxxxxxxx > Fixes: b90fa9ae8f51 ("[PATCH] shared mount handling: bind and rbind") > Tested-by: Andrei Vagin <avagin@xxxxxxxxxxxxx> > Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> > --- > > Since the last version some of you may have seen I have modified > my implementation of mnt_change_mountpoint so that it no longer calls > mntput or dput but instead relies on the knowledge that it can not > possibly have the last reference to the mnt and dentry of interest. > This avoids code checking tools from complaining bitterly. > > This is on top of my previous patch that sorts out locking of the > mountpoint hash table. After time giving ample time for review I intend > to push this and the previous bug fix to Linus. > > fs/mount.h | 1 - > fs/namespace.c | 110 +++++++++++++++++++++++++++++++-------------------------- > fs/pnode.c | 27 ++++++++++---- > fs/pnode.h | 2 ++ > 4 files changed, 82 insertions(+), 58 deletions(-) > > diff --git a/fs/mount.h b/fs/mount.h > index 2c856fc47ae3..2826543a131d 100644 > --- a/fs/mount.h > +++ b/fs/mount.h > @@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt) > } > > extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); > -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); > > extern int __legitimize_mnt(struct vfsmount *, unsigned); > extern bool legitimize_mnt(struct vfsmount *, unsigned); > diff --git a/fs/namespace.c b/fs/namespace.c > index 487ba30bb5c6..91ccfb73f0e0 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) > } > > /* > - * find the last mount at @dentry on vfsmount @mnt. > - * mount_lock must be held. > - */ > -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) > -{ > - struct mount *p, *res = NULL; > - p = __lookup_mnt(mnt, dentry); > - if (!p) > - goto out; > - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) > - res = p; > - hlist_for_each_entry_continue(p, mnt_hash) { > - if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) > - break; > - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) > - res = p; > - } > -out: > - return res; > -} > - > -/* > * lookup_mnt - Return the first child mount mounted at path > * > * "First" means first mounted chronologically. If you create the > @@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt, > hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); > } > > +static void __attach_mnt(struct mount *mnt, struct mount *parent) > +{ > + hlist_add_head_rcu(&mnt->mnt_hash, > + m_hash(&parent->mnt, mnt->mnt_mountpoint)); > + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); > +} > + > /* > * vfsmount lock must be held for write > */ > @@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt, > struct mountpoint *mp) > { > mnt_set_mountpoint(parent, mp, mnt); > - hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); > - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); > + __attach_mnt(mnt, parent); > } > > -static void attach_shadowed(struct mount *mnt, > - struct mount *parent, > - struct mount *shadows) > +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) > { > - if (shadows) { > - hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); > - list_add(&mnt->mnt_child, &shadows->mnt_child); > - } else { > - hlist_add_head_rcu(&mnt->mnt_hash, > - m_hash(&parent->mnt, mnt->mnt_mountpoint)); > - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); > - } > + struct mountpoint *old_mp = mnt->mnt_mp; > + struct dentry *old_mountpoint = mnt->mnt_mountpoint; > + struct mount *old_parent = mnt->mnt_parent; > + > + list_del_init(&mnt->mnt_child); > + hlist_del_init(&mnt->mnt_mp_list); > + hlist_del_init_rcu(&mnt->mnt_hash); > + > + attach_mnt(mnt, parent, mp); > + > + put_mountpoint(old_mp); > + > + /* > + * Safely avoid even the suggestion this code might sleep or > + * lock the mount hash by taking avantage of the knowlege that > + * mnt_change_mounpoint will not release the final reference > + * to a mountpoint. > + * > + * During mounting, another mount will continue to use the old > + * mountpoint and during unmounting, the old mountpoint will > + * continue to exist until namespace_unlock which happens well > + * after mnt_change_mountpoint. > + */ > + spin_lock(&old_mountpoint->d_lock); > + old_mountpoint->d_lockref.count--; > + spin_unlock(&old_mountpoint->d_lock); > + > + mnt_add_count(old_parent, -1); > } > > /* > * vfsmount lock must be held for write > */ > -static void commit_tree(struct mount *mnt, struct mount *shadows) > +static void commit_tree(struct mount *mnt) > { > struct mount *parent = mnt->mnt_parent; > struct mount *m; > @@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) > n->mounts += n->pending_mounts; > n->pending_mounts = 0; > > - attach_shadowed(mnt, parent, shadows); > + __attach_mnt(mnt, parent); > touch_mnt_namespace(n); > } > > @@ -1764,7 +1766,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, > continue; > > for (s = r; s; s = next_mnt(s, r)) { > - struct mount *t = NULL; > if (!(flag & CL_COPY_UNBINDABLE) && > IS_MNT_UNBINDABLE(s)) { > s = skip_mnt_tree(s); > @@ -1786,14 +1787,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, > goto out; > lock_mount_hash(); > list_add_tail(&q->mnt_list, &res->mnt_list); > - mnt_set_mountpoint(parent, p->mnt_mp, q); > - if (!list_empty(&parent->mnt_mounts)) { > - t = list_last_entry(&parent->mnt_mounts, > - struct mount, mnt_child); > - if (t->mnt_mp != p->mnt_mp) > - t = NULL; > - } > - attach_shadowed(q, parent, t); > + attach_mnt(q, parent, p->mnt_mp); > unlock_mount_hash(); > } > } > @@ -1992,10 +1986,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, > { > HLIST_HEAD(tree_list); > struct mnt_namespace *ns = dest_mnt->mnt_ns; > + struct mountpoint *smp; > struct mount *child, *p; > struct hlist_node *n; > int err; > > + /* Preallocate a mountpoint in case the new mounts need > + * to be tucked under other mounts. > + */ > + smp = get_mountpoint(source_mnt->mnt.mnt_root); > + if (IS_ERR(smp)) > + return PTR_ERR(smp); > + > /* Is there space to add these mounts to the mount namespace? */ > if (!parent_path) { > err = count_mounts(ns, source_mnt); > @@ -2022,17 +2024,22 @@ static int attach_recursive_mnt(struct mount *source_mnt, > touch_mnt_namespace(source_mnt->mnt_ns); > } else { > mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); > - commit_tree(source_mnt, NULL); > + commit_tree(source_mnt); > } > > hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { > - struct mount *q; > hlist_del_init(&child->mnt_hash); > - q = __lookup_mnt_last(&child->mnt_parent->mnt, > - child->mnt_mountpoint); > - commit_tree(child, q); > + if (child->mnt.mnt_root == smp->m_dentry) { > + struct mount *q; > + q = __lookup_mnt(&child->mnt_parent->mnt, > + child->mnt_mountpoint); > + if (q) > + mnt_change_mountpoint(child, smp, q); > + } > + commit_tree(child); > } > unlock_mount_hash(); > + put_mountpoint(smp); > > return 0; > > @@ -2046,6 +2053,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, > cleanup_group_ids(source_mnt, NULL); > out: > ns->pending_mounts = 0; > + put_mountpoint(smp); > return err; > } > > diff --git a/fs/pnode.c b/fs/pnode.c > index 06a793f4ae38..eb4331240fd1 100644 > --- a/fs/pnode.c > +++ b/fs/pnode.c > @@ -327,6 +327,9 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, > */ > static inline int do_refcount_check(struct mount *mnt, int count) > { > + struct mount *topper = __lookup_mnt(&mnt->mnt, mnt->mnt.mnt_root); > + if (topper) > + count++; > return mnt_get_count(mnt) > count; > } > > @@ -359,7 +362,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) > > for (m = propagation_next(parent, parent); m; > m = propagation_next(m, parent)) { > - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); > + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); > if (child && list_empty(&child->mnt_mounts) && > (ret = do_refcount_check(child, 1))) > break; > @@ -381,7 +384,7 @@ void propagate_mount_unlock(struct mount *mnt) > > for (m = propagation_next(parent, parent); m; > m = propagation_next(m, parent)) { > - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); > + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); > if (child) > child->mnt.mnt_flags &= ~MNT_LOCKED; > } > @@ -399,9 +402,11 @@ static void mark_umount_candidates(struct mount *mnt) > > for (m = propagation_next(parent, parent); m; > m = propagation_next(m, parent)) { > - struct mount *child = __lookup_mnt_last(&m->mnt, > + struct mount *child = __lookup_mnt(&m->mnt, > mnt->mnt_mountpoint); > - if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { > + if (!child || (child->mnt.mnt_flags & MNT_UMOUNT)) > + continue; > + if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) { > SET_MNT_MARK(child); > } > } > @@ -420,8 +425,8 @@ static void __propagate_umount(struct mount *mnt) > > for (m = propagation_next(parent, parent); m; > m = propagation_next(m, parent)) { > - > - struct mount *child = __lookup_mnt_last(&m->mnt, > + struct mount *topper; > + struct mount *child = __lookup_mnt(&m->mnt, > mnt->mnt_mountpoint); > /* > * umount the child only if the child has no children > @@ -430,6 +435,16 @@ static void __propagate_umount(struct mount *mnt) > if (!child || !IS_MNT_MARKED(child)) > continue; > CLEAR_MNT_MARK(child); > + > + /* If there is exactly one mount covering all of child > + * replace child with that mount. > + */ > + topper = __lookup_mnt(&child->mnt, child->mnt.mnt_root); > + if (topper && > + (child->mnt_mounts.next == &topper->mnt_child) && > + (topper->mnt_child.next == &child->mnt_mounts)) > + mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, topper); > + > if (list_empty(&child->mnt_mounts)) { > list_del_init(&child->mnt_child); > child->mnt.mnt_flags |= MNT_UMOUNT; > diff --git a/fs/pnode.h b/fs/pnode.h > index 550f5a8b4fcf..dc87e65becd2 100644 > --- a/fs/pnode.h > +++ b/fs/pnode.h > @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root); > unsigned int mnt_get_count(struct mount *mnt); > void mnt_set_mountpoint(struct mount *, struct mountpoint *, > struct mount *); > +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, > + struct mount *mnt); > struct mount *copy_tree(struct mount *, struct dentry *, int); > bool is_path_reachable(struct mount *, struct dentry *, > const struct path *root); > -- > 2.10.1 >