__ptrace_may_access() checks can happen on target tasks that are in the
middle of do_exit(), past exit_mm(). At that point, the ->mm pointer has
been NULLed out, and the mm_struct has been mmput().

Unfortunately, the mm_struct contains the dumpability and the user_ns in
which the task last went through execve(), and we need those for
__ptrace_may_access().

Currently, that problem is handled by failing open: If the ->mm is gone,
we assume that the task was dumpable. In some edge cases, this could
potentially expose access to things like /proc/$pid/fd/$fd of originally
non-dumpable processes. (exit_files() comes after exit_mm(), so the file
descriptor table is still there when we've gone through exit_mm().)

One way to fix this would be to move mm->user_ns and the dumpability
state over into the task_struct. However, that gets quite ugly if we
want to preserve existing semantics because e.g. PR_SET_DUMPABLE and
commit_creds() would then have to scan through all tasks sharing the
mm_struct and keep them in sync manually - that'd be a bit error-prone
and overcomplicated.

(Moving these things into the signal_struct is not an option because
that is kept across executions, and pre-execve co-threads will share
the signal_struct that is also used by the task that has gone through
execve().)

I believe that this patch may be the least bad option to fix this - keep
the mm_struct (but not process memory) around with an mmgrab() reference
from exit_mm() until the task goes away completely.

Note that this moves free_task() down in order to make mmdrop_async()
available without a forward declaration.

Cc: stable@xxxxxxxxxxxxxxx
Fixes: bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks")
Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
---
 include/linux/sched.h |  8 ++++++++
 kernel/exit.c         |  2 ++
 kernel/fork.c         | 54 ++++++++++++++++++++++----------------------
 kernel/ptrace.c       | 10 ++++++++++
 4 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index afe01e232935..55bec6ff5626 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -747,6 +747,14 @@ struct task_struct {
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
+	/*
+	 * When we exit and ->mm (the reference pinning ->mm's address space)
+	 * goes away, we stash a reference to the mm_struct itself (counted via
+	 * exit_mm->mm_count) in this member.
+	 * This allows us to continue using the mm_struct for security checks
+	 * and such even after the task has started exiting.
+	 */
+	struct mm_struct		*exit_mm;
 
 	/* Per-thread vma caching: */
 	struct vmacache			vmacache;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 733e80f334e7..97253ef33486 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -476,6 +476,8 @@ static void exit_mm(void)
 	/* more a memory barrier than a real lock */
 	task_lock(current);
 	current->mm = NULL;
+	mmgrab(mm); /* for current->exit_mm */
+	current->exit_mm = mm;
 	mmap_read_unlock(mm);
 	enter_lazy_tlb(mm, current);
 	task_unlock(current);
diff --git a/kernel/fork.c b/kernel/fork.c
index da8d360fb032..4942428a217c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -438,32 +438,6 @@ void put_task_stack(struct task_struct *tsk)
 }
 #endif
 
-void free_task(struct task_struct *tsk)
-{
-	scs_release(tsk);
-
-#ifndef CONFIG_THREAD_INFO_IN_TASK
-	/*
-	 * The task is finally done with both the stack and thread_info,
-	 * so free both.
-	 */
-	release_task_stack(tsk);
-#else
-	/*
-	 * If the task had a separate stack allocation, it should be gone
-	 * by now.
-	 */
-	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
-#endif
-	rt_mutex_debug_task_free(tsk);
-	ftrace_graph_exit_task(tsk);
-	arch_release_task_struct(tsk);
-	if (tsk->flags & PF_KTHREAD)
-		free_kthread_struct(tsk);
-	free_task_struct(tsk);
-}
-EXPORT_SYMBOL(free_task);
-
 #ifdef CONFIG_MMU
 static __latent_entropy int dup_mmap(struct mm_struct *mm,
 					struct mm_struct *oldmm)
@@ -722,6 +696,34 @@ static inline void put_signal_struct(struct signal_struct *sig)
 		free_signal_struct(sig);
 }
 
+void free_task(struct task_struct *tsk)
+{
+	scs_release(tsk);
+
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+	/*
+	 * The task is finally done with both the stack and thread_info,
+	 * so free both.
+	 */
+	release_task_stack(tsk);
+#else
+	/*
+	 * If the task had a separate stack allocation, it should be gone
+	 * by now.
+	 */
+	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
+#endif
+	rt_mutex_debug_task_free(tsk);
+	ftrace_graph_exit_task(tsk);
+	arch_release_task_struct(tsk);
+	if (tsk->flags & PF_KTHREAD)
+		free_kthread_struct(tsk);
+	if (tsk->exit_mm)
+		mmdrop_async(tsk->exit_mm);
+	free_task_struct(tsk);
+}
+EXPORT_SYMBOL(free_task);
+
 void __put_task_struct(struct task_struct *tsk)
 {
 	WARN_ON(!tsk->exit_state);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 43d6179508d6..0aedc6cf5bdc 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -342,7 +342,17 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	 * Pairs with a write barrier in commit_creds().
 	 */
 	smp_rmb();
+	/*
+	 * Look up the target task's mm_struct. If it fails because the task is
+	 * exiting and has gone through exit_mm(), we can instead use ->exit_mm
+	 * as long as we only use members that are preserved by an mmgrab()
+	 * reference.
+	 * The only case in which both ->mm and ->exit_mm can be NULL should be
+	 * kernel threads.
+	 */
 	mm = task->mm;
+	if (!mm)
+		mm = task->exit_mm;
 	if (mm &&
 	    ((get_dumpable(mm) != SUID_DUMP_USER) &&
 	     !ptrace_has_cap(cred, mm->user_ns, mode)))
-- 
2.29.0.rc1.297.gfa9743e501-goog
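
For readers who don't spend much time in mm/: the reason stashing the
mm_struct is cheap is that it carries two separate reference counts.
mm_users pins the whole address space and is dropped by mmput(), which
exit_mm() has already done by this point; mm_count, taken by mmgrab() and
released by mmdrop()/mmdrop_async(), pins only the mm_struct itself -
which is exactly where the dumpability bits and mm->user_ns that
__ptrace_may_access() needs live. The standalone userspace program below
is a minimal sketch of that lifetime under the patch; the field and
helper names mirror the kernel's, but it is an illustration only (plain
ints instead of the kernel's atomic counters), not actual kernel code:

/*
 * Toy userspace model of the mm_struct refcount split this patch relies
 * on. Names mirror the kernel's; the code itself is only illustrative.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct mm_struct {
	int mm_users;	/* pins the address space (VMAs, page tables) */
	int mm_count;	/* pins only the struct (dumpability, user_ns, ...) */
	int dumpable;	/* metadata that __ptrace_may_access() wants */
};

/* mmdrop(): release a struct-only reference; free the struct at zero. */
static void mmdrop(struct mm_struct *mm)
{
	if (--mm->mm_count == 0) {
		printf("mm_struct freed\n");
		free(mm);
	}
}

/* mmput(): release an address-space reference; tear down memory at zero. */
static void mmput(struct mm_struct *mm)
{
	if (--mm->mm_users == 0) {
		printf("address space torn down, metadata still readable\n");
		mmdrop(mm);	/* the mm_users block held one mm_count ref */
	}
}

/* mmgrab(): pin the struct itself, not the memory behind it. */
static void mmgrab(struct mm_struct *mm)
{
	mm->mm_count++;
}

int main(void)
{
	struct mm_struct *mm = calloc(1, sizeof(*mm));

	if (!mm)
		return 1;
	mm->mm_users = 1;	/* execve() gave the task an address space */
	mm->mm_count = 1;	/* ...which holds one reference on the struct */

	/* exit_mm(): pin the struct, then let go of the memory. */
	mmgrab(mm);		/* for current->exit_mm */
	mmput(mm);		/* the address space is gone after this */

	/* __ptrace_may_access() can still look at the metadata. */
	assert(mm->dumpable == 0);

	/* free_task(): drop the exit_mm reference; now the struct goes away. */
	mmdrop(mm);
	return 0;
}

In the real kernel both counters are atomics and the final drop goes
through atomic_dec_and_test(); the sketch only models the ordering that
matters here: mmgrab() in exit_mm() before the address space is released,
and the matching mmdrop_async() from free_task() once the task_struct
itself is being torn down.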