On 26.03.2019 11:07, Kirill Tkhai wrote: > On 26.03.2019 01:56, Andrea Arcangeli wrote: >> MEMCG depends on the task structure not to be freed under >> rcu_read_lock() in get_mem_cgroup_from_mm() after it dereferences >> mm->owner. >> >> An alternate possible fix would be to defer the delivery of the >> userfaultfd contexts to the monitor until after fork() is guaranteed >> to succeed. Such a change would require more changes because it would >> create a strict ordering dependency where the uffd methods would need >> to be called beyond the last potentially failing branch in order to be >> safe. This solution as opposed only adds the dependency to common code >> to set mm->owner to NULL and to free the task struct that was pointed >> by mm->owner with RCU, if fork ends up failing. The userfaultfd >> methods can still be called anywhere during the fork runtime and the >> monitor will keep discarding orphaned "mm" coming from failed forks in >> userland. >> >> This race condition couldn't trigger if CONFIG_MEMCG was set =n at >> build time. >> >> Fixes: 893e26e61d04 ("userfaultfd: non-cooperative: Add fork() event") >> Cc: stable@xxxxxxxxxx >> Tested-by: zhong jiang <zhongjiang@xxxxxxxxxx> >> Reported-by: syzbot+cbb52e396df3e565ab02@xxxxxxxxxxxxxxxxxxxxxxxxx >> Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> >> --- >> kernel/fork.c | 34 ++++++++++++++++++++++++++++++++-- >> 1 file changed, 32 insertions(+), 2 deletions(-) >> >> diff --git a/kernel/fork.c b/kernel/fork.c >> index 9dcd18aa210b..a19790e27afd 100644 >> --- a/kernel/fork.c >> +++ b/kernel/fork.c >> @@ -952,6 +952,15 @@ static void mm_init_aio(struct mm_struct *mm) >> #endif >> } >> >> +static __always_inline void mm_clear_owner(struct mm_struct *mm, >> + struct task_struct *p) >> +{ >> +#ifdef CONFIG_MEMCG >> + if (mm->owner == p) >> + WRITE_ONCE(mm->owner, NULL); >> +#endif >> +} >> + >> static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) >> { >> #ifdef CONFIG_MEMCG >> @@ -1331,6 +1340,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) >> free_pt: >> /* don't put binfmt in mmput, we haven't got module yet */ >> mm->binfmt = NULL; >> + mm_init_owner(mm, NULL); >> mmput(mm); >> >> fail_nomem: >> @@ -1662,6 +1672,24 @@ static inline void rcu_copy_process(struct task_struct *p) >> #endif /* #ifdef CONFIG_TASKS_RCU */ >> } >> >> +#ifdef CONFIG_MEMCG >> +static void __delayed_free_task(struct rcu_head *rhp) >> +{ >> + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); >> + >> + free_task(tsk); >> +} >> +#endif /* CONFIG_MEMCG */ >> + >> +static __always_inline void delayed_free_task(struct task_struct *tsk) >> +{ >> +#ifdef CONFIG_MEMCG >> + call_rcu(&tsk->rcu, __delayed_free_task); >> +#else /* CONFIG_MEMCG */ >> + free_task(tsk); >> +#endif /* CONFIG_MEMCG */ >> +} >> + >> /* >> * This creates a new process as a copy of the old one, >> * but does not actually start it yet. >> @@ -2123,8 +2151,10 @@ static __latent_entropy struct task_struct *copy_process( >> bad_fork_cleanup_namespaces: >> exit_task_namespaces(p); >> bad_fork_cleanup_mm: >> - if (p->mm) >> + if (p->mm) { >> + mm_clear_owner(p->mm, p); >> mmput(p->mm); >> + } >> bad_fork_cleanup_signal: >> if (!(clone_flags & CLONE_THREAD)) >> free_signal_struct(p->signal); >> @@ -2155,7 +2185,7 @@ static __latent_entropy struct task_struct *copy_process( >> bad_fork_free: >> p->state = TASK_DEAD; >> put_task_stack(p); >> - free_task(p); >> + delayed_free_task(p); > > Can't call_rcu(&p->rcu, delayed_put_task_struct) be used instead this? I mean: refcount_set(&tsk->usage, 2); call_rcu(&p->rcu, delayed_put_task_struct); And: diff --git a/kernel/fork.c b/kernel/fork.c index 3c516c6f7ce4..27cdf61b51a1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -715,7 +715,9 @@ static inline void put_signal_struct(struct signal_struct *sig) void __put_task_struct(struct task_struct *tsk) { - WARN_ON(!tsk->exit_state); + if (!tsk->exit_state) + /* Cleanup of copy_process() */ + goto free; WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); @@ -727,6 +729,7 @@ void __put_task_struct(struct task_struct *tsk) put_signal_struct(tsk->signal); if (!profile_handoff_task(tsk)) +free: free_task(tsk); } EXPORT_SYMBOL_GPL(__put_task_struct);