MEMCG depends on the task structure not to be freed under
rcu_read_lock() in get_mem_cgroup_from_mm() after it dereferences
mm->owner.  If fork() fails after the userfaultfd fork event has
already made the new mm visible to the uffd monitor, the task struct
pointed to by mm->owner could otherwise be freed while the monitor is
still operating on that mm under rcu_read_lock().

An alternate possible fix would be to defer the delivery of the
userfaultfd contexts to the monitor until after fork() is guaranteed
to succeed.  Such a change would be more invasive, because it would
create a strict ordering dependency: the uffd methods would need to
be called after the last potentially failing branch in order to be
safe.

This solution instead only adds a dependency on common code: if fork
ends up failing, set mm->owner to NULL and free the task struct that
mm->owner pointed to with RCU.  The userfaultfd methods can still be
called anywhere during the fork runtime, and the monitor will keep
discarding orphaned "mm"s coming from failed forks in userland.

This race condition cannot trigger if CONFIG_MEMCG is set to =n at
build time.

Fixes: 893e26e61d04 ("userfaultfd: non-cooperative: Add fork() event")
Cc: stable@xxxxxxxxxx
Tested-by: zhong jiang <zhongjiang@xxxxxxxxxx>
Reported-by: syzbot+cbb52e396df3e565ab02@xxxxxxxxxxxxxxxxxxxxxxxxx
Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
---
 kernel/fork.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa210b..a19790e27afd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -952,6 +952,15 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
+static __always_inline void mm_clear_owner(struct mm_struct *mm,
+					   struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+	if (mm->owner == p)
+		WRITE_ONCE(mm->owner, NULL);
+#endif
+}
+
 static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 {
 #ifdef CONFIG_MEMCG
@@ -1331,6 +1340,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
 free_pt:
 	/* don't put binfmt in mmput, we haven't got module yet */
 	mm->binfmt = NULL;
+	mm_init_owner(mm, NULL);
 	mmput(mm);
 
 fail_nomem:
@@ -1662,6 +1672,24 @@ static inline void rcu_copy_process(struct task_struct *p)
 #endif /* #ifdef CONFIG_TASKS_RCU */
 }
 
+#ifdef CONFIG_MEMCG
+static void __delayed_free_task(struct rcu_head *rhp)
+{
+	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+	free_task(tsk);
+}
+#endif /* CONFIG_MEMCG */
+
+static __always_inline void delayed_free_task(struct task_struct *tsk)
+{
+#ifdef CONFIG_MEMCG
+	call_rcu(&tsk->rcu, __delayed_free_task);
+#else /* CONFIG_MEMCG */
+	free_task(tsk);
+#endif /* CONFIG_MEMCG */
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -2123,8 +2151,10 @@ static __latent_entropy struct task_struct *copy_process(
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-	if (p->mm)
+	if (p->mm) {
+		mm_clear_owner(p->mm, p);
 		mmput(p->mm);
+	}
 bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
 		free_signal_struct(p->signal);
@@ -2155,7 +2185,7 @@ static __latent_entropy struct task_struct *copy_process(
 bad_fork_free:
 	p->state = TASK_DEAD;
 	put_task_stack(p);
-	free_task(p);
+	delayed_free_task(p);
 fork_out:
 	spin_lock_irq(&current->sighand->siglock);
 	hlist_del_init(&delayed.node);
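
For reference, below is a minimal, illustrative sketch of the RCU lifetime
rule the fix relies on: readers dereference a shared pointer under
rcu_read_lock() and may keep using the object until rcu_read_unlock(), so
the failure path must clear the pointer and defer the actual free with
call_rcu().  All sketch_* names are made up for illustration; this is not
the mm/memcontrol.c or kernel/fork.c code.  The sketch publishes NULL with
rcu_assign_pointer() on an __rcu pointer, whereas the patch itself uses a
plain WRITE_ONCE(mm->owner, NULL), which is enough because no newly
initialized object is being published.

/* Illustrative sketch only, not part of the patch. */
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct sketch_task {
	struct rcu_head rcu;
	int data;
};

static struct sketch_task __rcu *sketch_owner;

/* Reader: analogous to get_mem_cgroup_from_mm() following mm->owner. */
static int sketch_reader(void)
{
	struct sketch_task *t;
	int val = -1;

	rcu_read_lock();
	t = rcu_dereference(sketch_owner);
	if (t)
		val = t->data;	/* safe: "t" cannot be freed before rcu_read_unlock() */
	rcu_read_unlock();
	return val;
}

static void sketch_free_rcu(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct sketch_task, rcu));
}

/* Failure path: analogous to mm_clear_owner() followed by delayed_free_task(). */
static void sketch_fork_failed(struct sketch_task *t)
{
	rcu_assign_pointer(sketch_owner, NULL);	/* new readers observe NULL */
	call_rcu(&t->rcu, sketch_free_rcu);	/* old readers finish before the free */
}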