Re: [REVIEW][PATCH 2/2] mnt: Make propagate_umount less slow for overlapping mount propagation trees

Andrei Vagin <avagin@xxxxxxxxxxxxx> · Wed, 17 May 2017 15:48:54 -0700

Hi Eric,

I tested both patches and I haven't found any issue. Thanks.

On Wed, May 17, 2017 at 12:55:16AM -0500, Eric W. Biederman wrote:
> 
> Andrei Vagin pointed out that time to executue propagate_umount can go
> non-linear (and take a ludicrious amount of time) when the mount
> propogation trees of the mounts to be unmunted by a lazy unmount
> overlap.
> 
> Make the walk of the mount propagation trees nearly linear by
> remembering which mounts have already been visited, allowing
> subsequent walks to detect when walking a mount propgation tree or a
> subtree of a mount propgation tree would be duplicate work and to skip
> them entirely.
> 
> Walk the list of mounts whose propgatation trees need to be traversed
> from the mount highest in the mount tree to mounts lower in the mount
> tree so that odds are higher that the code will walk the largest trees
> first, allowing later tree walks to be skipped entirely.
> 
> Add cleanup_umount_visitation to remover the code's memory of which
> mounts have been visited.
> 
> Add the functions last_slave and skip_propagation_subtree to allow
> skipping appropriate parts of the mount propagation tree without
> needing to change the logic of the rest of the code.
> 
> A script to generate overlapping mount propagation trees:
> 
> $ cat runs.h
> set -e
> mount -t tmpfs zdtm /mnt
> mkdir -p /mnt/1 /mnt/2
> mount -t tmpfs zdtm /mnt/1
> mount --make-shared /mnt/1
> mkdir /mnt/1/1
> 
> iteration=10
> if [ -n "$1" ] ; then
> 	iteration=$1
> fi
> 
> for i in $(seq $iteration); do
> 	mount --bind /mnt/1/1 /mnt/1/1
> done
> 
> mount --rbind /mnt/1 /mnt/2
> 
> TIMEFORMAT='%Rs'
> nr=$(( ( 2 ** ( $iteration + 1 ) ) + 1 ))
> echo -n "umount -l /mnt/1 -> $nr        "
> time umount -l /mnt/1
> 
> nr=$(cat /proc/self/mountinfo | grep zdtm | wc -l )
> time umount -l /mnt/2
> 
> $ for i in $(seq 9 19); do echo $i; unshare -Urm bash ./run.sh $i; done
> 
> Here are the performance numbers with and without the patch:
> 
>      mhash |  8192   |  8192  | 1048576 | 1048576
>     mounts | before  | after  |  before | after
>     ------------------------------------------------
>       1025 |  0.040s | 0.016s |  0.038s | 0.019s
>       2049 |  0.094s | 0.017s |  0.080s | 0.018s
>       4097 |  0.243s | 0.019s |  0.206s | 0.023s
>       8193 |  1.202s | 0.028s |  1.562s | 0.032s
>      16385 |  9.635s | 0.036s |  9.952s | 0.041s
>      32769 | 60.928s | 0.063s | 44.321s | 0.064s
>      65537 |         | 0.097s |         | 0.097s
>     131073 |         | 0.233s |         | 0.176s
>     262145 |         | 0.653s |         | 0.344s
>     524289 |         | 2.305s |         | 0.735s
>    1048577 |         | 7.107s |         | 2.603s
> 
> Andrei Vagin reports fixing the performance problem is part of the
> work to fix CVE-2016-6213.
> 
> Cc: stable@xxxxxxxxxxxxxxx
> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> Reported-by: Andrei Vagin <avagin@xxxxxxxxxx>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
> ---
>  fs/pnode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 62 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/pnode.c b/fs/pnode.c
> index fbaca7df2eb0..53d411a371ce 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p)
>  	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
>  }
>  
> +static inline struct mount *last_slave(struct mount *p)
> +{
> +	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
> +}
> +
>  static inline struct mount *next_slave(struct mount *p)
>  {
>  	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
> @@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m,
>  	}
>  }
>  
> +static struct mount *skip_propagation_subtree(struct mount *m,
> +						struct mount *origin)
> +{
> +	/*
> +	 * Advance m such that propagation_next will not return
> +	 * the slaves of m.
> +	 */
> +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
> +		m = last_slave(m);
> +
> +	return m;
> +}
> +
>  static struct mount *next_group(struct mount *m, struct mount *origin)
>  {
>  	while (1) {
> @@ -505,6 +523,15 @@ static void restore_mounts(struct list_head *to_restore)
>  	}
>  }
>  
> +static void cleanup_umount_visitations(struct list_head *visited)
> +{
> +	while (!list_empty(visited)) {
> +		struct mount *mnt =
> +			list_first_entry(visited, struct mount, mnt_umounting);
> +		list_del_init(&mnt->mnt_umounting);
> +	}
> +}
> +
>  /*
>   * collect all mounts that receive propagation from the mount in @list,
>   * and return these additional mounts in the same list.
> @@ -517,11 +544,23 @@ int propagate_umount(struct list_head *list)
>  	struct mount *mnt;
>  	LIST_HEAD(to_restore);
>  	LIST_HEAD(to_umount);
> +	LIST_HEAD(visited);
>  
> -	list_for_each_entry(mnt, list, mnt_list) {
> +	/* Find candidates for unmounting */
> +	list_for_each_entry_reverse(mnt, list, mnt_list) {
>  		struct mount *parent = mnt->mnt_parent;
>  		struct mount *m;
>  
> +		/*
> +		 * If this mount has already been visited it is known that it's
> +		 * entire peer group and all of their slaves in the propagation
> +		 * tree for the mountpoint has already been visited and there is
> +		 * no need to visit them again.
> +		 */
> +		if (!list_empty(&mnt->mnt_umounting))
> +			continue;
> +
> +		list_add_tail(&mnt->mnt_umounting, &visited);
>  		for (m = propagation_next(parent, parent); m;
>  		     m = propagation_next(m, parent)) {
>  			struct mount *child = __lookup_mnt(&m->mnt,
> @@ -529,6 +568,27 @@ int propagate_umount(struct list_head *list)
>  			if (!child)
>  				continue;
>  
> +			if (!list_empty(&child->mnt_umounting)) {
> +				/*
> +				 * If the child has already been visited it is
> +				 * know that it's entire peer group and all of
> +				 * their slaves in the propgation tree for the
> +				 * mountpoint has already been visited and there
> +				 * is no need to visit this subtree again.
> +				 */
> +				m = skip_propagation_subtree(m, parent);
> +				continue;
> +			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
> +				/*
> +				 * We have come accross an partially unmounted
> +				 * mount in list that has not been visited yet.
> +				 * Remember it has been visited and continue
> +				 * about our merry way.
> +				 */
> +				list_add_tail(&child->mnt_umounting, &visited);
> +				continue;
> +			}
> +
>  			/* Check the child and parents while progress is made */
>  			while (__propagate_umount(child,
>  						  &to_umount, &to_restore)) {
> @@ -542,6 +602,7 @@ int propagate_umount(struct list_head *list)
>  
>  	umount_list(&to_umount, &to_restore);
>  	restore_mounts(&to_restore);
> +	cleanup_umount_visitations(&visited);
>  	list_splice_tail(&to_umount, list);
>  
>  	return 0;
> -- 
> 2.10.1
>