On Fri, 2017-05-12 at 00:08 -0700, Andrei Vagin wrote: > With this patch, we don't try to umount all mounts of a tree together. > Instead of this we umount them one by one. In this case, we see a significant > improvement in performance for the worsе case. Indeed, umount has been very slow for a while now. Even a moderately large number of mounts (~10000) become painfully slow. Re you still perusing this? Anything I can do to help? Eric, what are your thoughts on this latest attempt? > > The reason of this optimization is that umount() can hold namespace_sem > for a long time, this semaphore is global, so it affects all users. > Recently Eric W. Biederman added a per mount namespace limit on the > number of mounts. The default number of mounts allowed per mount > namespace at 100,000. Currently this value is allowed to construct a tree > which requires hours to be umounted. > > In a worse case the current complexity of umount_tree() is O(n^3). > * Enumirate all mounts in a target tree (propagate_umount) > * Enumirate mounts to find where these changes have to > be propagated (mark_umount_candidates) > * Enumirate mounts to find a requered mount by parent and dentry > (__lookup_mnt). __lookup_mnt() searches a mount in m_hash, but > the number of mounts is much bigger than a size of the hash. > > The worse case is when all mounts from the tree live in the same shared > group. In this case we have to enumirate all mounts on each step. > > There is CVE-2016-6213 about this case. > > Here are results for the kernel with this patch > $ for i in `seq 10 15`; do unshare -m sh ./run.sh $i; done > umount -l /mnt/1 -> 0:00.00 > umount -l /mnt/1 -> 0:00.01 > umount -l /mnt/1 -> 0:00.01 > umount -l /mnt/1 -> 0:00.03 > umount -l /mnt/1 -> 0:00.07 > umount -l /mnt/1 -> 0:00.14 > > Here are results for the kernel without this patch > $ for i in `seq 10 15`; do unshare -m sh ./run.sh $i; done > umount -l /mnt/1 -> 0:00.04 > umount -l /mnt/1 -> 0:00.17 > umount -l /mnt/1 -> 0:00.75 > umount -l /mnt/1 -> 0:05.96 > umount -l /mnt/1 -> 0:34.40 > umount -l /mnt/1 -> 3:46.27 > > And here is a test script: > $ cat run.sh > set -e -m > > mount -t tmpfs zdtm /mnt > mkdir -p /mnt/1 /mnt/2 > mount -t tmpfs zdtm /mnt/1 > mount --make-shared /mnt/1 > mkdir /mnt/1/1 > > for i in `seq $1`; do > ./mount --bind /mnt/1/1 /mnt/1/1 > done > > echo -n "umount -l /mnt/1 -> " > /usr/bin/time -f '%E' ./umount -l /mnt/1 > > And we need these simple mount and umount tools, because the standard > ones read /proc/self/mountinfo, but this is extremely slow when we have > thousands of mounts. > $ cat mount.c > #include <sys/mount.h> > #include <stdlib.h> > > int main(int argc, char **argv) > { > return mount(argv[2], argv[3], NULL, MS_BIND, NULL); > } > > $ cat umount.c > #include <sys/mount.h> > > int main(int argc, char **argv) > { > return umount2(argv[2], MNT_DETACH); > } > > Here is a previous attempt to optimize this code: > https://lkml.org/lkml/2016/10/10/495 > > Signed-off-by: Andrei Vagin <avagin@xxxxxxxxxx> > --- > fs/namespace.c | 81 +++++++++++++++++++++++++++++++------------------------ > --- > 1 file changed, 43 insertions(+), 38 deletions(-) > > diff --git a/fs/namespace.c b/fs/namespace.c > index 3bf0cd2..4e6f258 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -1474,56 +1474,61 @@ static bool disconnect_mount(struct mount *mnt, enum > umount_tree_flags how) > */ > static void umount_tree(struct mount *mnt, enum umount_tree_flags how) > { > - LIST_HEAD(tmp_list); > struct mount *p; > + int done = 0; > > if (how & UMOUNT_PROPAGATE) > propagate_mount_unlock(mnt); > > /* Gather the mounts to umount */ > - for (p = mnt; p; p = next_mnt(p, mnt)) { > + while (!done) { > + LIST_HEAD(tmp_list); > + > + p = mnt; > + while (!list_empty(&p->mnt_mounts)) > + p = list_entry(p->mnt_mounts.next, struct mount, > mnt_child); > + > p->mnt.mnt_flags |= MNT_UMOUNT; > list_move(&p->mnt_list, &tmp_list); > - } > - > - /* Hide the mounts from mnt_mounts */ > - list_for_each_entry(p, &tmp_list, mnt_list) { > list_del_init(&p->mnt_child); > - } > > - /* Add propogated mounts to the tmp_list */ > - if (how & UMOUNT_PROPAGATE) > - propagate_umount(&tmp_list); > - > - while (!list_empty(&tmp_list)) { > - struct mnt_namespace *ns; > - bool disconnect; > - p = list_first_entry(&tmp_list, struct mount, mnt_list); > - list_del_init(&p->mnt_expire); > - list_del_init(&p->mnt_list); > - ns = p->mnt_ns; > - if (ns) { > - ns->mounts--; > - __touch_mnt_namespace(ns); > - } > - p->mnt_ns = NULL; > - if (how & UMOUNT_SYNC) > - p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; > - > - disconnect = disconnect_mount(p, how); > - > - pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, > - disconnect ? &unmounted : NULL); > - if (mnt_has_parent(p)) { > - mnt_add_count(p->mnt_parent, -1); > - if (!disconnect) { > - /* Don't forget about p */ > - list_add_tail(&p->mnt_child, &p->mnt_parent- > >mnt_mounts); > - } else { > - umount_mnt(p); > + /* Add propogated mounts to the tmp_list */ > + if (how & UMOUNT_PROPAGATE) > + propagate_umount(&tmp_list); > + > + if (p == mnt) > + done = 1; > + > + while (!list_empty(&tmp_list)) { > + struct mnt_namespace *ns; > + bool disconnect; > + p = list_first_entry(&tmp_list, struct mount, > mnt_list); > + list_del_init(&p->mnt_expire); > + list_del_init(&p->mnt_list); > + ns = p->mnt_ns; > + if (ns) { > + ns->mounts--; > + __touch_mnt_namespace(ns); > + } > + p->mnt_ns = NULL; > + if (how & UMOUNT_SYNC) > + p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; > + > + disconnect = disconnect_mount(p, how); > + > + pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, > + disconnect ? &unmounted : NULL); > + if (mnt_has_parent(p)) { > + mnt_add_count(p->mnt_parent, -1); > + if (!disconnect) { > + /* Don't forget about p */ > + list_add_tail(&p->mnt_child, &p- > >mnt_parent->mnt_mounts); > + } else { > + umount_mnt(p); > + } > } > + change_mnt_propagation(p, MS_PRIVATE); > } > - change_mnt_propagation(p, MS_PRIVATE); > } > } >