Prior to this patch, the kernel can spend a lot of time waiting for an RCU grace period, as this stack trace shows: [<ffffffffbe5491e3>] __wait_rcu_gp+0x93/0xe0 [<ffffffffbe549418>] synchronize_sched+0x48/0x60 [<ffffffffbe7ae5b3>] kern_unmount+0x3a/0x46 [<ffffffffbe847c02>] mq_put_mnt+0x15/0x17 [<ffffffffbe8481af>] put_ipc_ns+0x36/0x8b This patch solves the issue by removing synchronize_rcu from mq_put_mnt. This is done by implementing an asynchronous version of kern_unmount. Since mntput() may sleep and an RCU callback must not, the mntput() call needs to be deferred to a work queue. Additionally, the callers of mq_put_mnt appear to be safe with it behaving asynchronously. In particular, put_ipc_ns calls mq_clear_sbinfo, which renders the inode inaccessible for the purposes of mqueue_create by making s_fs_info NULL. This appears to be what prevents access while free_ipc_ns is in progress. So, the unmount should be able to proceed lazily. Tested: Ran the following program: int main(void) { int pid; int status; int i; for (i = 0; i < 1000; i++) { pid = fork(); if (!pid) { assert(!unshare(CLONE_NEWUSER| CLONE_NEWIPC|CLONE_NEWNS)); return 0; } assert(waitpid(pid, &status, 0) == pid); } } Before: $ time ./unshare2 real 0m9.784s user 0m0.428s sys 0m0.000s After: $ time ./unshare2 real 0m0.368s user 0m0.226s sys 0m0.122s Signed-off-by: Salman Qazi <sqazi@xxxxxxxxxx> Reviewed-by: Eric Dumazet <edumazet@xxxxxxxxxx> --- fs/namespace.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + ipc/mqueue.c | 2 +- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 678ef175d63a..e60b473c3bbc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3321,6 +3321,47 @@ void kern_unmount(struct vfsmount *mnt) } EXPORT_SYMBOL(kern_unmount); +struct async_unmount_cb { + struct vfsmount *mnt; + struct work_struct work; + struct rcu_head rcu_head; +}; + +static void kern_unmount_work(struct work_struct *work) +{ + struct async_unmount_cb *cb = container_of(work, + struct async_unmount_cb, work); + 
mntput(cb->mnt); + kfree(cb); +} + +static void kern_unmount_rcu_cb(struct rcu_head *rcu_head) +{ + struct async_unmount_cb *cb = container_of(rcu_head, + struct async_unmount_cb, rcu_head); + + INIT_WORK(&cb->work, kern_unmount_work); + schedule_work(&cb->work); + +} + +void kern_unmount_async(struct vfsmount *mnt) +{ + /* release long term mount so mount point can be released */ + if (!IS_ERR_OR_NULL(mnt)) { + struct async_unmount_cb *cb = kmalloc(sizeof(*cb), GFP_KERNEL); + + if (cb) { + real_mount(mnt)->mnt_ns = NULL; + cb->mnt = mnt; + call_rcu(&cb->rcu_head, kern_unmount_rcu_cb); + } else { + kern_unmount(mnt); + } + } +} + bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 29d8e2cfed0e..8865997a8722 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); #define kern_mount(type) kern_mount_data(type, NULL) +extern void kern_unmount_async(struct vfsmount *mnt); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c595bed7bfcb..a8c2465ac0cb 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns) void mq_put_mnt(struct ipc_namespace *ns) { - kern_unmount(ns->mq_mnt); + kern_unmount_async(ns->mq_mnt); } static int __init init_mqueue_fs(void) -- 2.21.0.352.gf09ad66450-goog