On Wed, Feb 6, 2019 at 11:54 AM Salman Qazi <sqazi@xxxxxxxxxx> wrote: > > Prior to this patch, the kernel can spend a lot of time with > this stack trace: > > [<ffffffffbe5491e3>] __wait_rcu_gp+0x93/0xe0 > [<ffffffffbe549418>] synchronize_sched+0x48/0x60 > [<ffffffffbe7ae5b3>] kern_unmount+0x3a/0x46 > [<ffffffffbe847c02>] mq_put_mnt+0x15/0x17 > [<ffffffffbe8481af>] put_ipc_ns+0x36/0x8b > > This patch solves the issue by removing synchronize_rcu from mq_put_mnt. > This is done by implementing an asynchronous version of kern_unmount. > > Since mntput() sleeps, it needs to be deferred to a work queue. > > Additionally, the callers of mq_put_mnt appear to be safe having > it behave asynchronously. In particular, put_ipc_ns calls > mq_clear_sbinfo which renders the inode inaccessible for the purposes of > mqueue_create by making s_fs_info NULL. This appears > to be the thing that prevents access while free_ipc_ns is taking place. > So, the unmount should be able to proceed lazily. > > Tested: Ran the following program: > > int main(void) > { > int pid; > int status; > int i; > > for (i = 0; i < 1000; i++) { > pid = fork(); > if (!pid) { > assert(!unshare(CLONE_NEWUSER| > CLONE_NEWIPC|CLONE_NEWNS)); > return 0; > } > > assert(waitpid(pid, &status, 0) == pid); > } > } > > Before: > > $ time ./unshare2 > > real 0m9.784s > user 0m0.428s > sys 0m0.000s > > After: > > $ time ./unshare2 > > real 0m0.368s > user 0m0.226s > sys 0m0.122s > > Signed-off-by: Salman Qazi <sqazi@xxxxxxxxxx> Reviewed-by: Eric Dumazet <edumazet@xxxxxxxxxx> > --- > fs/namespace.c | 41 +++++++++++++++++++++++++++++++++++++++++ > include/linux/fs.h | 1 + > ipc/mqueue.c | 2 +- > 3 files changed, 43 insertions(+), 1 deletion(-) > > diff --git a/fs/namespace.c b/fs/namespace.c > index a677b59efd74..caa51ca81605 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -3323,6 +3323,47 @@ void kern_unmount(struct vfsmount *mnt) > } > EXPORT_SYMBOL(kern_unmount); > > +struct async_unmount_cb { > + struct vfsmount *mnt; > + struct work_struct work; > + struct rcu_head rcu_head; > +}; > + > +static void kern_unmount_work(struct work_struct *work) > +{ > + struct async_unmount_cb *cb = container_of(work, > + struct async_unmount_cb, work); > + > + mntput(cb->mnt); > + kfree(cb); > +} > + > +static void kern_unmount_rcu_cb(struct rcu_head *rcu_head) > +{ > + struct async_unmount_cb *cb = container_of(rcu_head, > + struct async_unmount_cb, rcu_head); > + > + INIT_WORK(&cb->work, kern_unmount_work); > + schedule_work(&cb->work); > + > +} > + > +void kern_unmount_async(struct vfsmount *mnt) > +{ > + /* release long term mount so mount point can be released */ > + if (!IS_ERR_OR_NULL(mnt)) { > + struct async_unmount_cb *cb = kmalloc(sizeof(*cb), GFP_KERNEL); > + > + if (cb) { > + real_mount(mnt)->mnt_ns = NULL; > + cb->mnt = mnt; > + call_rcu(&cb->rcu_head, kern_unmount_rcu_cb); > + } else { > + kern_unmount(mnt); > + } > + } > +} > + > bool our_mnt(struct vfsmount *mnt) > { > return check_mnt(real_mount(mnt)); > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 29d8e2cfed0e..8865997a8722 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *); > extern int unregister_filesystem(struct file_system_type *); > extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); > #define kern_mount(type) kern_mount_data(type, NULL) > +extern void kern_unmount_async(struct vfsmount *mnt); > extern void kern_unmount(struct vfsmount *mnt); > extern int may_umount_tree(struct vfsmount *); > extern int may_umount(struct vfsmount *); > diff --git a/ipc/mqueue.c b/ipc/mqueue.c > index c595bed7bfcb..a8c2465ac0cb 100644 > --- a/ipc/mqueue.c > +++ b/ipc/mqueue.c > @@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns) > > void mq_put_mnt(struct ipc_namespace *ns) > { > - kern_unmount(ns->mq_mnt); > + kern_unmount_async(ns->mq_mnt); > } > > static int __init init_mqueue_fs(void) > -- > 2.20.1.611.gfbb209baf1-goog >