I found that a few processes can eat all host memory and nobody can kill them. $ mount -t tmpfs xxx /mnt $ mount --make-shared /mnt $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done All this processes are unkillable, because they took i_mutex and waits namespace_lock. ... 21715 pts/0 D 0:00 \_ mount --bind /mnt /mnt/test.ht6jzO 21716 pts/0 D 0:00 \_ mount --bind /mnt /mnt/test.97K4mI 21717 pts/0 R 0:01 \_ mount --bind /mnt /mnt/test.gO2CD9 ... Each of this process doubles a number of mounts, so at the end we will have about 2^32 mounts and the size of struct mnt is 256 bytes, so we need about 1TB of RAM. Another problem is that “umount” of a big tree is very hard operation and it requires a lot of time. E.g.: 16411 umount("/tmp/xxx", MNT_DETACH) = 0 <7.852066> (7.8 sec) 32795 umount("/tmp/xxx", MNT_DETACH) = 0 <34.485501> ( 34 sec) For all this time sys_umoun takes namespace_sem and vfsmount_lock... Due to all this reasons I suggest to restrict a number of mounts. Probably we can optimize this code in a future, but now this restriction can help. Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx> Cc: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> Cc: "Serge E. Hallyn" <serge@xxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: Kees Cook <keescook@xxxxxxxxxxxx> Cc: Mel Gorman <mgorman@xxxxxxx> Cc: Rik van Riel <riel@xxxxxxxxxx> Signed-off-by: Andrey Vagin <avagin@xxxxxxxxxx> --- fs/namespace.c | 66 +++++++++++++++++++++++++------------------ include/linux/mnt_namespace.h | 2 ++ kernel/sysctl.c | 8 ++++++ 3 files changed, 49 insertions(+), 27 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 7b1ca9b..d22e54c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -41,6 +41,9 @@ static struct list_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static struct rw_semaphore namespace_sem; +unsigned int sysctl_mount_nr __read_mostly = 16384; +static atomic_t mount_nr = ATOMIC_INIT(0); + /* /sys/fs */ struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); @@ -164,43 +167,49 @@ unsigned int mnt_get_count(struct mount *mnt) static struct mount *alloc_vfsmnt(const char *name) { - struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); - if (mnt) { - int err; + struct mount *mnt; + int err; - err = mnt_alloc_id(mnt); - if (err) - goto out_free_cache; + if (atomic_inc_return(&mount_nr) > sysctl_mount_nr) + goto out_dec_mount_nr; - if (name) { - mnt->mnt_devname = kstrdup(name, GFP_KERNEL); - if (!mnt->mnt_devname) - goto out_free_id; - } + mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); + if (!mnt) + goto out_dec_mount_nr; + + err = mnt_alloc_id(mnt); + if (err) + goto out_free_cache; + + if (name) { + mnt->mnt_devname = kstrdup(name, GFP_KERNEL); + if (!mnt->mnt_devname) + goto out_free_id; + } #ifdef CONFIG_SMP - mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); - if (!mnt->mnt_pcp) - goto out_free_devname; + mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); + if (!mnt->mnt_pcp) + goto out_free_devname; - this_cpu_add(mnt->mnt_pcp->mnt_count, 1); + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); #else - mnt->mnt_count = 1; - mnt->mnt_writers = 0; + mnt->mnt_count = 1; + mnt->mnt_writers = 0; #endif - INIT_LIST_HEAD(&mnt->mnt_hash); - INIT_LIST_HEAD(&mnt->mnt_child); - INIT_LIST_HEAD(&mnt->mnt_mounts); - INIT_LIST_HEAD(&mnt->mnt_list); - INIT_LIST_HEAD(&mnt->mnt_expire); - INIT_LIST_HEAD(&mnt->mnt_share); - INIT_LIST_HEAD(&mnt->mnt_slave_list); - INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_LIST_HEAD(&mnt->mnt_hash); + INIT_LIST_HEAD(&mnt->mnt_child); + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_expire); + INIT_LIST_HEAD(&mnt->mnt_share); + INIT_LIST_HEAD(&mnt->mnt_slave_list); + INIT_LIST_HEAD(&mnt->mnt_slave); #ifdef CONFIG_FSNOTIFY - INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); + INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif - } + return mnt; #ifdef CONFIG_SMP @@ -211,6 +220,8 @@ out_free_id: mnt_free_id(mnt); out_free_cache: kmem_cache_free(mnt_cache, mnt); +out_dec_mount_nr: + atomic_dec(&mount_nr); return NULL; } @@ -546,6 +557,7 @@ static void free_vfsmnt(struct mount *mnt) #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif + atomic_dec(&mount_nr); kmem_cache_free(mnt_cache, mnt); } diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 12b2ab5..d8e5ec9 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -2,6 +2,8 @@ #define _NAMESPACE_H_ #ifdef __KERNEL__ +extern unsigned int sysctl_mount_nr; + struct mnt_namespace; struct fs_struct; struct user_namespace; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf45..bebfdd7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,7 @@ #include <linux/kmod.h> #include <linux/capability.h> #include <linux/binfmts.h> +#include <linux/mnt_namespace.h> #include <linux/sched/sysctl.h> #include <asm/uaccess.h> @@ -1616,6 +1617,13 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "mount-nr", + .data = &sysctl_mount_nr, + .maxlen = sizeof(sysctl_mount_nr), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html