[PATCH RESENT] fs, ipc: Use an asynchronous version of kern_unmount in IPC

Salman Qazi <sqazi@xxxxxxxxxx> · Mon, 4 Mar 2019 11:48:59 -0800

Prior to this patch, the kernel can spend a lot of time with
this stack trace:

[<ffffffffbe5491e3>] __wait_rcu_gp+0x93/0xe0
[<ffffffffbe549418>] synchronize_sched+0x48/0x60
[<ffffffffbe7ae5b3>] kern_unmount+0x3a/0x46
[<ffffffffbe847c02>] mq_put_mnt+0x15/0x17
[<ffffffffbe8481af>] put_ipc_ns+0x36/0x8b

This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
This is done by implementing an asynchronous version of kern_unmount.

Since mntput() sleeps, it needs to be deferred to a work queue.

Additionally, the callers of mq_put_mnt appear to be safe having
it behave asynchronously.  In particular, put_ipc_ns calls
mq_clear_sbinfo which renders the inode inaccessible for the purposes of
mqueue_create by making s_fs_info NULL.  This appears
to be the thing that prevents access while free_ipc_ns is taking place.
So, the unmount should be able to proceed lazily.

Tested: Ran the following program:

    int main(void)
    {
            int pid;
            int status;
            int i;

            for (i = 0; i < 1000; i++) {
                    pid = fork();
                    if (!pid) {
                            assert(!unshare(CLONE_NEWUSER|
                                      CLONE_NEWIPC|CLONE_NEWNS));
                            return 0;
                    }

                    assert(waitpid(pid, &status, 0) == pid);
            }
    }

Before:

$ time ./unshare2

real    0m9.784s
user    0m0.428s
sys     0m0.000s

After:

$ time ./unshare2

real    0m0.368s
user    0m0.226s
sys     0m0.122s

Signed-off-by: Salman Qazi <sqazi@xxxxxxxxxx>
Reviewed-by: Eric Dumazet <edumazet@xxxxxxxxxx>
---
 fs/namespace.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  1 +
 ipc/mqueue.c       |  2 +-
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 678ef175d63a..e60b473c3bbc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3321,6 +3321,47 @@ void kern_unmount(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(kern_unmount);
 
+struct async_unmount_cb {
+	struct vfsmount *mnt;
+	struct work_struct work;
+	struct rcu_head rcu_head;
+};
+
+static void kern_unmount_work(struct work_struct *work)
+{
+	struct async_unmount_cb *cb = container_of(work,
+			struct async_unmount_cb, work);
+
+	mntput(cb->mnt);
+	kfree(cb);
+}
+
+static void kern_unmount_rcu_cb(struct rcu_head *rcu_head)
+{
+	struct async_unmount_cb *cb = container_of(rcu_head,
+			struct async_unmount_cb, rcu_head);
+
+	INIT_WORK(&cb->work, kern_unmount_work);
+	schedule_work(&cb->work);
+
+}
+
+void kern_unmount_async(struct vfsmount *mnt)
+{
+	/* release long term mount so mount point can be released */
+	if (!IS_ERR_OR_NULL(mnt)) {
+		struct async_unmount_cb *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+
+		if (cb) {
+			real_mount(mnt)->mnt_ns = NULL;
+			cb->mnt = mnt;
+			call_rcu(&cb->rcu_head, kern_unmount_rcu_cb);
+		} else {
+			kern_unmount(mnt);
+		}
+	}
+}
+
 bool our_mnt(struct vfsmount *mnt)
 {
 	return check_mnt(real_mount(mnt));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29d8e2cfed0e..8865997a8722 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *);
 extern int unregister_filesystem(struct file_system_type *);
 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
 #define kern_mount(type) kern_mount_data(type, NULL)
+extern void kern_unmount_async(struct vfsmount *mnt);
 extern void kern_unmount(struct vfsmount *mnt);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c595bed7bfcb..a8c2465ac0cb 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns)
 
 void mq_put_mnt(struct ipc_namespace *ns)
 {
-	kern_unmount(ns->mq_mnt);
+	kern_unmount_async(ns->mq_mnt);
 }
 
 static int __init init_mqueue_fs(void)
-- 
2.21.0.352.gf09ad66450-goog