Re: [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Feb 6, 2019 at 11:54 AM Salman Qazi <sqazi@xxxxxxxxxx> wrote:
>
> Prior to this patch, the kernel can spend a lot of time blocked in
> this stack trace:
>
> [<ffffffffbe5491e3>] __wait_rcu_gp+0x93/0xe0
> [<ffffffffbe549418>] synchronize_sched+0x48/0x60
> [<ffffffffbe7ae5b3>] kern_unmount+0x3a/0x46
> [<ffffffffbe847c02>] mq_put_mnt+0x15/0x17
> [<ffffffffbe8481af>] put_ipc_ns+0x36/0x8b
>
> This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
> This is done by implementing an asynchronous version of kern_unmount.
>
> Since mntput() can sleep, it cannot be called directly from the RCU
> callback and needs to be deferred to a work queue.
>
> Additionally, the callers of mq_put_mnt appear to be safe with
> it behaving asynchronously.  In particular, put_ipc_ns calls
> mq_clear_sbinfo which renders the inode inaccessible for the purposes of
> mqueue_create by making s_fs_info NULL.  This appears
> to be the thing that prevents access while free_ipc_ns is taking place.
> So, the unmount should be able to proceed lazily.
>
> Tested: Ran the following program:
>
>     int main(void)
>     {
>             int pid;
>             int status;
>             int i;
>
>             for (i = 0; i < 1000; i++) {
>                     pid = fork();
>                     if (!pid) {
>                             assert(!unshare(CLONE_NEWUSER|
>                                       CLONE_NEWIPC|CLONE_NEWNS));
>                             return 0;
>                     }
>
>                     assert(waitpid(pid, &status, 0) == pid);
>             }
>     }
>
> Before:
>
> $ time ./unshare2
>
> real    0m9.784s
> user    0m0.428s
> sys     0m0.000s
>
> After:
>
> $ time ./unshare2
>
> real    0m0.368s
> user    0m0.226s
> sys     0m0.122s
>
> Signed-off-by: Salman Qazi <sqazi@xxxxxxxxxx>

Reviewed-by: Eric Dumazet <edumazet@xxxxxxxxxx>

> ---
>  fs/namespace.c     | 41 +++++++++++++++++++++++++++++++++++++++++
>  include/linux/fs.h |  1 +
>  ipc/mqueue.c       |  2 +-
>  3 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index a677b59efd74..caa51ca81605 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -3323,6 +3323,47 @@ void kern_unmount(struct vfsmount *mnt)
>  }
>  EXPORT_SYMBOL(kern_unmount);
>
> +struct async_unmount_cb {
> +       struct vfsmount *mnt;
> +       struct work_struct work;
> +       struct rcu_head rcu_head;
> +};
> +
> +static void kern_unmount_work(struct work_struct *work)
> +{
> +       struct async_unmount_cb *cb = container_of(work,
> +                       struct async_unmount_cb, work);
> +
> +       mntput(cb->mnt);
> +       kfree(cb);
> +}
> +
> +static void kern_unmount_rcu_cb(struct rcu_head *rcu_head)
> +{
> +       struct async_unmount_cb *cb = container_of(rcu_head,
> +                       struct async_unmount_cb, rcu_head);
> +
> +       INIT_WORK(&cb->work, kern_unmount_work);
> +       schedule_work(&cb->work);
> +
> +}
> +
> +void kern_unmount_async(struct vfsmount *mnt)
> +{
> +       /* release long term mount so mount point can be released */
> +       if (!IS_ERR_OR_NULL(mnt)) {
> +               struct async_unmount_cb *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
> +
> +               if (cb) {
> +                       real_mount(mnt)->mnt_ns = NULL;
> +                       cb->mnt = mnt;
> +                       call_rcu(&cb->rcu_head, kern_unmount_rcu_cb);
> +               } else {
> +                       kern_unmount(mnt);
> +               }
> +       }
> +}
> +
>  bool our_mnt(struct vfsmount *mnt)
>  {
>         return check_mnt(real_mount(mnt));
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 29d8e2cfed0e..8865997a8722 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *);
>  extern int unregister_filesystem(struct file_system_type *);
>  extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
>  #define kern_mount(type) kern_mount_data(type, NULL)
> +extern void kern_unmount_async(struct vfsmount *mnt);
>  extern void kern_unmount(struct vfsmount *mnt);
>  extern int may_umount_tree(struct vfsmount *);
>  extern int may_umount(struct vfsmount *);
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index c595bed7bfcb..a8c2465ac0cb 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns)
>
>  void mq_put_mnt(struct ipc_namespace *ns)
>  {
> -       kern_unmount(ns->mq_mnt);
> +       kern_unmount_async(ns->mq_mnt);
>  }
>
>  static int __init init_mqueue_fs(void)
> --
> 2.20.1.611.gfbb209baf1-goog
>



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux