Andrei Vagin <avagin@xxxxxxxxxxxxx> writes:
> Hi Eric,
>
> I tested both patches and I haven't found any issue.

Thanks.  Can I get a Tested-by, an Acked-by, or a Reviewed-by?

Apologies this took so long to get to this point; until I realized
that we could move mnt_change_mountpoint into a separate pass, this
didn't look possible.

Eric

>
> On Wed, May 17, 2017 at 12:55:16AM -0500, Eric W. Biederman wrote:
>>
>> Andrei Vagin pointed out that the time to execute propagate_umount can
>> go non-linear (and take a ludicrous amount of time) when the mount
>> propagation trees of the mounts to be unmounted by a lazy unmount
>> overlap.
>>
>> Make the walk of the mount propagation trees nearly linear by
>> remembering which mounts have already been visited, allowing
>> subsequent walks to detect when walking a mount propagation tree or a
>> subtree of a mount propagation tree would be duplicate work and to
>> skip them entirely.
>>
>> Walk the list of mounts whose propagation trees need to be traversed
>> from the mount highest in the mount tree to mounts lower in the mount
>> tree so that the odds are higher that the code will walk the largest
>> trees first, allowing later tree walks to be skipped entirely.
>>
>> Add cleanup_umount_visitations to remove the code's memory of which
>> mounts have been visited.
>>
>> Add the functions last_slave and skip_propagation_subtree to allow
>> skipping appropriate parts of the mount propagation tree without
>> needing to change the logic of the rest of the code.
>>
>> A script to generate overlapping mount propagation trees:
>>
>> $ cat run.sh
>> set -e
>> mount -t tmpfs zdtm /mnt
>> mkdir -p /mnt/1 /mnt/2
>> mount -t tmpfs zdtm /mnt/1
>> mount --make-shared /mnt/1
>> mkdir /mnt/1/1
>>
>> iteration=10
>> if [ -n "$1" ] ; then
>> 	iteration=$1
>> fi
>>
>> for i in $(seq $iteration); do
>> 	mount --bind /mnt/1/1 /mnt/1/1
>> done
>>
>> mount --rbind /mnt/1 /mnt/2
>>
>> TIMEFORMAT='%Rs'
>> nr=$(( ( 2 ** ( $iteration + 1 ) ) + 1 ))
>> echo -n "umount -l /mnt/1 -> $nr "
>> time umount -l /mnt/1
>>
>> nr=$(cat /proc/self/mountinfo | grep zdtm | wc -l )
>> echo -n "umount -l /mnt/2 -> $nr "
>> time umount -l /mnt/2
>>
>> $ for i in $(seq 9 19); do echo $i; unshare -Urm bash ./run.sh $i; done
>>
>> Here are the performance numbers with and without the patch:
>>
>>   mhash |   8192  |  8192  | 1048576 | 1048576
>>  mounts |  before |  after |  before |  after
>> ------------------------------------------------
>>    1025 |  0.040s | 0.016s |  0.038s | 0.019s
>>    2049 |  0.094s | 0.017s |  0.080s | 0.018s
>>    4097 |  0.243s | 0.019s |  0.206s | 0.023s
>>    8193 |  1.202s | 0.028s |  1.562s | 0.032s
>>   16385 |  9.635s | 0.036s |  9.952s | 0.041s
>>   32769 | 60.928s | 0.063s | 44.321s | 0.064s
>>   65537 |         | 0.097s |         | 0.097s
>>  131073 |         | 0.233s |         | 0.176s
>>  262145 |         | 0.653s |         | 0.344s
>>  524289 |         | 2.305s |         | 0.735s
>> 1048577 |         | 7.107s |         | 2.603s
>>
>> Andrei Vagin reports fixing the performance problem is part of the
>> work to fix CVE-2016-6213.
>>
>> Cc: stable@xxxxxxxxxxxxxxx
>> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
>> Reported-by: Andrei Vagin <avagin@xxxxxxxxxx>
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
>> ---
>>  fs/pnode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 62 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/pnode.c b/fs/pnode.c
>> index fbaca7df2eb0..53d411a371ce 100644
>> --- a/fs/pnode.c
>> +++ b/fs/pnode.c
>> @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p)
>>  	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
>>  }
>>
>> +static inline struct mount *last_slave(struct mount *p)
>> +{
>> +	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
>> +}
>> +
>>  static inline struct mount *next_slave(struct mount *p)
>>  {
>>  	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
>> @@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m,
>>  	}
>>  }
>>
>> +static struct mount *skip_propagation_subtree(struct mount *m,
>> +					      struct mount *origin)
>> +{
>> +	/*
>> +	 * Advance m such that propagation_next will not return
>> +	 * the slaves of m.
>> +	 */
>> +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
>> +		m = last_slave(m);
>> +
>> +	return m;
>> +}
>> +
>>  static struct mount *next_group(struct mount *m, struct mount *origin)
>>  {
>>  	while (1) {
>> @@ -505,6 +523,15 @@ static void restore_mounts(struct list_head *to_restore)
>>  	}
>>  }
>>
>> +static void cleanup_umount_visitations(struct list_head *visited)
>> +{
>> +	while (!list_empty(visited)) {
>> +		struct mount *mnt =
>> +			list_first_entry(visited, struct mount, mnt_umounting);
>> +		list_del_init(&mnt->mnt_umounting);
>> +	}
>> +}
>> +
>>  /*
>>   * collect all mounts that receive propagation from the mount in @list,
>>   * and return these additional mounts in the same list.
>> @@ -517,11 +544,23 @@ int propagate_umount(struct list_head *list)
>>  	struct mount *mnt;
>>  	LIST_HEAD(to_restore);
>>  	LIST_HEAD(to_umount);
>> +	LIST_HEAD(visited);
>>
>> -	list_for_each_entry(mnt, list, mnt_list) {
>> +	/* Find candidates for unmounting */
>> +	list_for_each_entry_reverse(mnt, list, mnt_list) {
>>  		struct mount *parent = mnt->mnt_parent;
>>  		struct mount *m;
>>
>> +		/*
>> +		 * If this mount has already been visited it is known that its
>> +		 * entire peer group and all of their slaves in the propagation
>> +		 * tree for the mountpoint have already been visited and there
>> +		 * is no need to visit them again.
>> +		 */
>> +		if (!list_empty(&mnt->mnt_umounting))
>> +			continue;
>> +
>> +		list_add_tail(&mnt->mnt_umounting, &visited);
>>  		for (m = propagation_next(parent, parent); m;
>>  		     m = propagation_next(m, parent)) {
>>  			struct mount *child = __lookup_mnt(&m->mnt,
>> @@ -529,6 +568,27 @@ int propagate_umount(struct list_head *list)
>>  			if (!child)
>>  				continue;
>>
>> +			if (!list_empty(&child->mnt_umounting)) {
>> +				/*
>> +				 * If the child has already been visited it is
>> +				 * known that its entire peer group and all of
>> +				 * their slaves in the propagation tree for the
>> +				 * mountpoint have already been visited and there
>> +				 * is no need to visit this subtree again.
>> +				 */
>> +				m = skip_propagation_subtree(m, parent);
>> +				continue;
>> +			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
>> +				/*
>> +				 * We have come across a partially unmounted
>> +				 * mount in list that has not been visited yet.
>> +				 * Remember it has been visited and continue
>> +				 * about our merry way.
>> +				 */
>> +				list_add_tail(&child->mnt_umounting, &visited);
>> +				continue;
>> +			}
>> +
>>  			/* Check the child and parents while progress is made */
>>  			while (__propagate_umount(child,
>>  						  &to_umount, &to_restore)) {
>> @@ -542,6 +602,7 @@ int propagate_umount(struct list_head *list)
>>
>>  	umount_list(&to_umount, &to_restore);
>>  	restore_mounts(&to_restore);
>> +	cleanup_umount_visitations(&visited);
>>  	list_splice_tail(&to_umount, list);
>>
>>  	return 0;
>> --
>> 2.10.1
>>