On Thu, Oct 13, 2016 at 02:53:46PM -0500, Eric W. Biederman wrote:
> 
> Andrei Vagin pointed out that the time to execute propagate_umount can
> go non-linear (and take a ludicrous amount of time) when the mount
> propagation trees of the mounts to be unmounted by a lazy unmount
> overlap.
> 
> Solve this in the most straightforward way possible, by adding a new
> mount flag to mark parts of the mount propagation tree that have been
> visited, and use that mark to skip parts of the mount propagation tree
> that have already been visited during an unmount. This guarantees
> that each mountpoint in the possibly overlapping mount propagation
> trees will be visited exactly once.
> 
> Add the functions propagation_visit_next and propagation_revisit_next
> to coordinate setting and clearing the visited mount mark.
> 
> Here is a script to generate such a mount tree:
> $ cat run.sh
> mount -t tmpfs test-mount /mnt
> mount --make-shared /mnt
> for i in `seq $1`; do
> 	mkdir /mnt/test.$i
> 	mount --bind /mnt /mnt/test.$i
> done
> cat /proc/mounts | grep test-mount | wc -l
> time umount -l /mnt
> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
> 
> Here are the performance numbers with and without the patch:
> 
> mounts | before | after (real sec)
> ---------------------------------
>   1024 |  0.071 | 0.024
>   2048 |  0.184 | 0.030
>   4096 |  0.604 | 0.040
>   8192 |  4.471 | 0.043
>  16384 | 34.826 | 0.082
>  32768 |        | 0.151
>  65536 |        | 0.289
> 131072 |        | 0.659
> 
> For Andrei Vagin, fixing this performance problem is part of the work
> to fix CVE-2016-6213.
> 
> Cc: stable@xxxxxxxxxxxxxxx
> Reported-by: Andrei Vagin <avagin@xxxxxxxxxx>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
> ---
> 
> Andrei, can you take a look at this patch and see if you can spot any
> problems? My limited testing suggests this approach does a much better
> job of solving the problem you were seeing, with the time now looking
> almost linear in the number of mounts.

I read this patch and I like the idea. Then I ran my tests, and one of
them fails with this patch. I haven't found the reason yet. Here is the
test:

[root@fc24 mounts]# cat run.sh
set -e

mount -t tmpfs zdtm /mnt
mkdir -p /mnt/1 /mnt/2
mount -t tmpfs zdtm /mnt/1
mount --make-shared /mnt/1
for i in `seq $1`; do
	mount --bind /mnt/1 `mktemp -d /mnt/1/test.XXXXXX`
done
mount --rbind /mnt/1 /mnt/2
cat /proc/self/mountinfo | grep zdtm | wc -l
time umount -l /mnt/1
cat /proc/self/mountinfo | grep zdtm | wc -l
umount /mnt/2

[root@fc24 mounts]# unshare -Urm ./run.sh 5
65

real	0m0.014s
user	0m0.000s
sys	0m0.004s
33
umount: /mnt/2: target is busy
        (In some cases useful info about processes that use
         the device is found by lsof(8) or fuser(1).)
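For readers following the thread: the core of the patch is a
mark-and-skip walk, so that each node of a possibly self-overlapping
propagation graph is processed exactly once. The idea can be sketched
independently of the kernel's data structures; here is a minimal
illustration with a hypothetical struct node standing in for
struct mount (a sketch, not kernel code):

#include <stdbool.h>
#include <stddef.h>

struct node {
	bool visited;			/* plays the role of MNT_VISITED */
	size_t nchildren;
	struct node **children;		/* the propagation edges */
};

/* Test-and-set the visited mark before descending, so each node of
 * an overlapping graph is handled exactly once. */
static void walk_once(struct node *n, void (*fn)(struct node *))
{
	if (!n || n->visited)
		return;			/* already reached via another path */
	n->visited = true;
	fn(n);				/* process this node exactly once */
	for (size_t i = 0; i < n->nchildren; i++)
		walk_once(n->children[i], fn);
}

/* A second pass clears the marks so the structure can be walked
 * again later, mirroring what propagation_revisit_next() does. */
static void clear_marks(struct node *n)
{
	if (!n || !n->visited)
		return;
	n->visited = false;
	for (size_t i = 0; i < n->nchildren; i++)
		clear_marks(n->children[i]);
}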
> 
>  fs/pnode.c            | 125 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  fs/pnode.h            |   4 ++
>  include/linux/mount.h |   2 +
>  3 files changed, 126 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 234a9ac49958..3acce0c75f94 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -164,6 +164,120 @@ static struct mount *propagation_next(struct mount *m,
>  	}
>  }
>  
> +/*
> + * get the next mount in the propagation tree (that has not been visited)
> + * @m: the mount seen last
> + * @origin: the original mount from where the tree walk initiated
> + *
> + * Note that peer groups form contiguous segments of slave lists.
> + * We rely on that in get_source() to be able to find out if
> + * vfsmount found while iterating with propagation_next() is
> + * a peer of one we'd found earlier.
> + */
> +static struct mount *propagation_visit_next(struct mount *m,
> +					    struct mount *origin)
> +{
> +	/* Has this part of the propagation tree already been visited? */
> +	if (IS_MNT_VISITED(m))
> +		return NULL;
> +
> +	SET_MNT_VISITED(m);
> +
> +	/* are there any slaves of this mount? */
> +	if (!list_empty(&m->mnt_slave_list)) {
> +		struct mount *slave = first_slave(m);
> +		while (1) {
> +			if (!IS_MNT_VISITED(slave))
> +				return slave;
> +			if (slave->mnt_slave.next == &m->mnt_slave_list)
> +				break;
> +			slave = next_slave(slave);
> +		}
> +	}
> +	while (1) {
> +		struct mount *master = m->mnt_master;
> +
> +		if (master == origin->mnt_master) {
> +			struct mount *next = next_peer(m);
> +			while (1) {
> +				if (next == origin)
> +					return NULL;
> +				if (!IS_MNT_VISITED(next))
> +					return next;
> +				next = next_peer(next);
> +			}
> +		} else {
> +			while (1) {
> +				if (m->mnt_slave.next == &master->mnt_slave_list)
> +					break;
> +				m = next_slave(m);
> +				if (!IS_MNT_VISITED(m))
> +					return m;
> +			}
> +		}
> +
> +		/* back at master */
> +		m = master;
> +	}
> +}
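The walk above pairs with a mirror-image walk below: in the hunks
further down, mark_umount_candidates() iterates with
propagation_visit_next(), setting MNT_VISITED, and __propagate_umount()
then iterates with propagation_revisit_next(), clearing it again.
Condensed, the intended two-pass pattern looks roughly like this, where
mark_candidate and umount_candidate are hypothetical stand-ins for the
real loop bodies:

	/* pass 1: sets MNT_VISITED on every mount it reaches */
	for (m = propagation_visit_next(parent, parent); m;
	     m = propagation_visit_next(m, parent))
		mark_candidate(m);		/* hypothetical helper */

	/* pass 2: clears MNT_VISITED again, restoring the flags */
	for (m = propagation_revisit_next(parent, parent); m;
	     m = propagation_revisit_next(m, parent))
		umount_candidate(m);		/* hypothetical helper */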
> +
> +/*
> + * get the next mount in the propagation tree (that has not been revisited)
> + * @m: the mount seen last
> + * @origin: the original mount from where the tree walk initiated
> + *
> + * Note that peer groups form contiguous segments of slave lists.
> + * We rely on that in get_source() to be able to find out if
> + * vfsmount found while iterating with propagation_next() is
> + * a peer of one we'd found earlier.
> + */
> +static struct mount *propagation_revisit_next(struct mount *m,
> +					      struct mount *origin)
> +{
> +	/* Has this part of the propagation tree already been revisited? */
> +	if (!IS_MNT_VISITED(m))
> +		return NULL;
> +
> +	CLEAR_MNT_VISITED(m);
> +
> +	/* are there any slaves of this mount? */
> +	if (!list_empty(&m->mnt_slave_list)) {
> +		struct mount *slave = first_slave(m);
> +		while (1) {
> +			if (IS_MNT_VISITED(slave))
> +				return slave;
> +			if (slave->mnt_slave.next == &m->mnt_slave_list)
> +				break;
> +			slave = next_slave(slave);
> +		}
> +	}
> +	while (1) {
> +		struct mount *master = m->mnt_master;
> +
> +		if (master == origin->mnt_master) {
> +			struct mount *next = next_peer(m);
> +			while (1) {
> +				if (next == origin)
> +					return NULL;
> +				if (IS_MNT_VISITED(next))
> +					return next;
> +				next = next_peer(next);
> +			}
> +		} else {
> +			while (1) {
> +				if (m->mnt_slave.next == &master->mnt_slave_list)
> +					break;
> +				m = next_slave(m);
> +				if (IS_MNT_VISITED(m))
> +					return m;
> +			}
> +		}
> +
> +		/* back at master */
> +		m = master;
> +	}
> +}
> +
>  static struct mount *next_group(struct mount *m, struct mount *origin)
>  {
>  	while (1) {
> @@ -399,11 +513,12 @@ static void mark_umount_candidates(struct mount *mnt)
>  
>  	BUG_ON(parent == mnt);
>  
> -	for (m = propagation_next(parent, parent); m;
> -	     m = propagation_next(m, parent)) {
> +	for (m = propagation_visit_next(parent, parent); m;
> +	     m = propagation_visit_next(m, parent)) {
>  		struct mount *child = __lookup_mnt_last(&m->mnt,
>  						mnt->mnt_mountpoint);
> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
> +		if (child && (!IS_MNT_LOCKED(child) ||
> +			      IS_MNT_MARKED(m))) {
>  			SET_MNT_MARK(child);
>  		}
>  	}
> @@ -420,8 +535,8 @@ static void __propagate_umount(struct mount *mnt)
>  
>  	BUG_ON(parent == mnt);
>  
> -	for (m = propagation_next(parent, parent); m;
> -	     m = propagation_next(m, parent)) {
> +	for (m = propagation_revisit_next(parent, parent); m;
> +	     m = propagation_revisit_next(m, parent)) {
>  
>  		struct mount *child = __lookup_mnt_last(&m->mnt,
>  						mnt->mnt_mountpoint);
> diff --git a/fs/pnode.h b/fs/pnode.h
> index 550f5a8b4fcf..988ea4945764 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -21,6 +21,10 @@
>  #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
>  #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
>  
> +#define IS_MNT_VISITED(m) ((m)->mnt.mnt_flags & MNT_VISITED)
> +#define SET_MNT_VISITED(m) ((m)->mnt.mnt_flags |= MNT_VISITED)
> +#define CLEAR_MNT_VISITED(m) ((m)->mnt.mnt_flags &= ~MNT_VISITED)
> +
>  #define CL_EXPIRE		0x01
>  #define CL_SLAVE		0x02
>  #define CL_COPY_UNBINDABLE	0x04
> diff --git a/include/linux/mount.h b/include/linux/mount.h
> index 9227b190fdf2..6048045b96c3 100644
> --- a/include/linux/mount.h
> +++ b/include/linux/mount.h
> @@ -52,6 +52,8 @@ struct mnt_namespace;
>  
>  #define MNT_INTERNAL	0x4000
>  
> +#define MNT_VISITED	0x010000
> +
>  #define MNT_LOCK_ATIME		0x040000
>  #define MNT_LOCK_NOEXEC		0x080000
>  #define MNT_LOCK_NOSUID		0x100000
> -- 
> 2.8.3
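P.S. For anyone reproducing the timings: "umount -l" in the scripts
above is the MNT_DETACH flavour of umount2(2), so what is being timed
is essentially a single system call. A minimal standalone equivalent
(sketch):

#include <stdio.h>
#include <sys/mount.h>

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}
	/* lazy unmount: detach the tree now, free mounts once unused */
	if (umount2(argv[1], MNT_DETACH)) {
		perror("umount2");
		return 1;
	}
	return 0;
}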