On Wed, May 17, 2017 at 06:26:09PM -0500, Eric W. Biederman wrote:
> Andrei Vagin <avagin@xxxxxxxxxxxxx> writes:
>
> > Hi Eric,
> >
> > I tested both patches and I haven't found any issue. Thanks.
>
> Can I get a Tested-by, an Acked-by or a Reviewed-by?

Sure. I read the patches and they look good to me.

Reviewed-by: Andrei Vagin <avagin@xxxxxxxxxxxxx>

Thanks,
Andrei

>
> Apologies this took so long to get to this point.  Until I realized
> that we could move mnt_change_mountpoint into a separate pass, this
> didn't look possible.
>
> Eric
>
> >
> > On Wed, May 17, 2017 at 12:55:16AM -0500, Eric W. Biederman wrote:
> >>
> >> Andrei Vagin pointed out that the time to execute propagate_umount can
> >> go non-linear (and take a ludicrous amount of time) when the mount
> >> propagation trees of the mounts to be unmounted by a lazy unmount
> >> overlap.
> >>
> >> Make the walk of the mount propagation trees nearly linear by
> >> remembering which mounts have already been visited, allowing
> >> subsequent walks to detect when walking a mount propagation tree or a
> >> subtree of a mount propagation tree would be duplicate work and to
> >> skip them entirely.
> >>
> >> Walk the list of mounts whose propagation trees need to be traversed
> >> from the mount highest in the mount tree to mounts lower in the mount
> >> tree so that the odds are higher that the code will walk the largest
> >> trees first, allowing later tree walks to be skipped entirely.
> >>
> >> Add cleanup_umount_visitations to remove the code's memory of which
> >> mounts have been visited.
> >>
> >> Add the functions last_slave and skip_propagation_subtree to allow
> >> skipping appropriate parts of the mount propagation tree without
> >> needing to change the logic of the rest of the code.
> >>
> >> A script to generate overlapping mount propagation trees:
> >>
> >> $ cat run.sh
> >> set -e
> >> mount -t tmpfs zdtm /mnt
> >> mkdir -p /mnt/1 /mnt/2
> >> mount -t tmpfs zdtm /mnt/1
> >> mount --make-shared /mnt/1
> >> mkdir /mnt/1/1
> >>
> >> iteration=10
> >> if [ -n "$1" ] ; then
> >> 	iteration=$1
> >> fi
> >>
> >> for i in $(seq $iteration); do
> >> 	mount --bind /mnt/1/1 /mnt/1/1
> >> done
> >>
> >> mount --rbind /mnt/1 /mnt/2
> >>
> >> TIMEFORMAT='%Rs'
> >> nr=$(( ( 2 ** ( $iteration + 1 ) ) + 1 ))
> >> echo -n "umount -l /mnt/1 -> $nr "
> >> time umount -l /mnt/1
> >>
> >> nr=$(cat /proc/self/mountinfo | grep zdtm | wc -l )
> >> time umount -l /mnt/2
> >>
> >> $ for i in $(seq 9 19); do echo $i; unshare -Urm bash ./run.sh $i; done
> >>
> >> Here are the performance numbers with and without the patch:
> >>
> >>    mhash |   8192  |   8192  | 1048576 | 1048576
> >>   mounts |  before |  after  |  before |  after
> >> ------------------------------------------------
> >>     1025 |  0.040s |  0.016s |  0.038s |  0.019s
> >>     2049 |  0.094s |  0.017s |  0.080s |  0.018s
> >>     4097 |  0.243s |  0.019s |  0.206s |  0.023s
> >>     8193 |  1.202s |  0.028s |  1.562s |  0.032s
> >>    16385 |  9.635s |  0.036s |  9.952s |  0.041s
> >>    32769 | 60.928s |  0.063s | 44.321s |  0.064s
> >>    65537 |         |  0.097s |         |  0.097s
> >>   131073 |         |  0.233s |         |  0.176s
> >>   262145 |         |  0.653s |         |  0.344s
> >>   524289 |         |  2.305s |         |  0.735s
> >>  1048577 |         |  7.107s |         |  2.603s
> >>
> >> Andrei Vagin reports that fixing the performance problem is part of
> >> the work to fix CVE-2016-6213.
> >>
> >> Cc: stable@xxxxxxxxxxxxxxx
> >> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> >> Reported-by: Andrei Vagin <avagin@xxxxxxxxxx>
> >> Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
Biederman" <ebiederm@xxxxxxxxxxxx> > >> --- > >> fs/pnode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- > >> 1 file changed, 62 insertions(+), 1 deletion(-) > >> > >> diff --git a/fs/pnode.c b/fs/pnode.c > >> index fbaca7df2eb0..53d411a371ce 100644 > >> --- a/fs/pnode.c > >> +++ b/fs/pnode.c > >> @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p) > >> return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); > >> } > >> > >> +static inline struct mount *last_slave(struct mount *p) > >> +{ > >> + return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); > >> +} > >> + > >> static inline struct mount *next_slave(struct mount *p) > >> { > >> return list_entry(p->mnt_slave.next, struct mount, mnt_slave); > >> @@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m, > >> } > >> } > >> > >> +static struct mount *skip_propagation_subtree(struct mount *m, > >> + struct mount *origin) > >> +{ > >> + /* > >> + * Advance m such that propagation_next will not return > >> + * the slaves of m. > >> + */ > >> + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) > >> + m = last_slave(m); > >> + > >> + return m; > >> +} > >> + > >> static struct mount *next_group(struct mount *m, struct mount *origin) > >> { > >> while (1) { > >> @@ -505,6 +523,15 @@ static void restore_mounts(struct list_head *to_restore) > >> } > >> } > >> > >> +static void cleanup_umount_visitations(struct list_head *visited) > >> +{ > >> + while (!list_empty(visited)) { > >> + struct mount *mnt = > >> + list_first_entry(visited, struct mount, mnt_umounting); > >> + list_del_init(&mnt->mnt_umounting); > >> + } > >> +} > >> + > >> /* > >> * collect all mounts that receive propagation from the mount in @list, > >> * and return these additional mounts in the same list. > >> @@ -517,11 +544,23 @@ int propagate_umount(struct list_head *list) > >> struct mount *mnt; > >> LIST_HEAD(to_restore); > >> LIST_HEAD(to_umount); > >> + LIST_HEAD(visited); > >> > >> - list_for_each_entry(mnt, list, mnt_list) { > >> + /* Find candidates for unmounting */ > >> + list_for_each_entry_reverse(mnt, list, mnt_list) { > >> struct mount *parent = mnt->mnt_parent; > >> struct mount *m; > >> > >> + /* > >> + * If this mount has already been visited it is known that it's > >> + * entire peer group and all of their slaves in the propagation > >> + * tree for the mountpoint has already been visited and there is > >> + * no need to visit them again. > >> + */ > >> + if (!list_empty(&mnt->mnt_umounting)) > >> + continue; > >> + > >> + list_add_tail(&mnt->mnt_umounting, &visited); > >> for (m = propagation_next(parent, parent); m; > >> m = propagation_next(m, parent)) { > >> struct mount *child = __lookup_mnt(&m->mnt, > >> @@ -529,6 +568,27 @@ int propagate_umount(struct list_head *list) > >> if (!child) > >> continue; > >> > >> + if (!list_empty(&child->mnt_umounting)) { > >> + /* > >> + * If the child has already been visited it is > >> + * know that it's entire peer group and all of > >> + * their slaves in the propgation tree for the > >> + * mountpoint has already been visited and there > >> + * is no need to visit this subtree again. > >> + */ > >> + m = skip_propagation_subtree(m, parent); > >> + continue; > >> + } else if (child->mnt.mnt_flags & MNT_UMOUNT) { > >> + /* > >> + * We have come accross an partially unmounted > >> + * mount in list that has not been visited yet. > >> + * Remember it has been visited and continue > >> + * about our merry way. 
> >> +				 */
> >> +				list_add_tail(&child->mnt_umounting, &visited);
> >> +				continue;
> >> +			}
> >> +
> >>  			/* Check the child and parents while progress is made */
> >>  			while (__propagate_umount(child,
> >>  						  &to_umount, &to_restore)) {
> >> @@ -542,6 +602,7 @@ int propagate_umount(struct list_head *list)
> >>
> >>  	umount_list(&to_umount, &to_restore);
> >>  	restore_mounts(&to_restore);
> >> +	cleanup_umount_visitations(&visited);
> >>  	list_splice_tail(&to_umount, list);
> >>
> >>  	return 0;
> >> --
> >> 2.10.1
> >>
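
P.S. For anyone following the thread who is less used to the kernel's
intrusive list idiom: the trick the patch relies on is that an
initialized-but-empty mnt_umounting list_head means "not visited yet",
threading it onto a local visited list marks the mount, and
list_del_init() at the end restores the empty state, so no extra flag
is needed. Below is a rough userspace sketch of just that idiom. It is
not kernel code; toy_mount, visit_link, mark_visited and
cleanup_visited are made-up names, and the list helpers are simplified
stand-ins for <linux/list.h>.

/*
 * Userspace sketch only -- not part of the patch.
 */
#include <stdbool.h>
#include <stdio.h>

struct list_node {
	struct list_node *prev, *next;
};

static void list_init(struct list_node *n)
{
	n->prev = n->next = n;
}

static bool list_is_empty(const struct list_node *n)
{
	return n->next == n;
}

static void list_add_tail(struct list_node *entry, struct list_node *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

static void list_del_init(struct list_node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	list_init(n);
}

struct toy_mount {
	const char *name;
	struct list_node visit_link;	/* plays the role of mnt_umounting */
};

/*
 * Mark @m as visited by threading visit_link onto @visited.  Returns
 * false if @m was already marked, in which case the caller skips the
 * whole subtree (the job skip_propagation_subtree() does in the patch).
 */
static bool mark_visited(struct toy_mount *m, struct list_node *visited)
{
	if (!list_is_empty(&m->visit_link))
		return false;
	list_add_tail(&m->visit_link, visited);
	return true;
}

/* Undo every marking, like cleanup_umount_visitations() in the patch. */
static void cleanup_visited(struct list_node *visited)
{
	while (!list_is_empty(visited))
		list_del_init(visited->next);
}

int main(void)
{
	struct toy_mount a = { .name = "a" }, b = { .name = "b" };
	struct list_node visited;

	list_init(&visited);
	list_init(&a.visit_link);
	list_init(&b.visit_link);

	printf("visit a: %d\n", mark_visited(&a, &visited));	/* 1: first visit */
	printf("visit a: %d\n", mark_visited(&a, &visited));	/* 0: already seen, skip */
	printf("visit b: %d\n", mark_visited(&b, &visited));	/* 1: first visit */

	cleanup_visited(&visited);
	printf("a unmarked again: %d\n", list_is_empty(&a.visit_link));	/* 1 */
	return 0;
}

The nice property is that no extra field is needed: an unlinked
list_head already encodes "not visited", which is why the cleanup pass
only has to empty the visited list before the mounts are handed back.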