__ptrace_may_access() checks can happen on target tasks that are in the
middle of do_exit(), past exit_mm(). At that point, the ->mm pointer has
been NULLed out, and the mm_struct has been mmput().

Unfortunately, the mm_struct contains the dumpability and the user_ns in
which the task last went through execve(), and we need those for
__ptrace_may_access().

Currently, that problem is handled by failing open: If the ->mm is gone,
we assume that the task was dumpable. In some edge cases, this could
potentially expose access to things like /proc/$pid/fd/$fd of originally
non-dumpable processes. (exit_files() comes after exit_mm(), so the file
descriptor table is still there when we've gone through exit_mm().)

One way to fix this would be to move mm->user_ns and the dumpability
state over into the task_struct. However, that gets quite ugly if we
want to preserve existing semantics because e.g. PR_SET_DUMPABLE and
commit_creds() would then have to scan through all tasks sharing the
mm_struct and keep them in sync manually - that'd be a bit error-prone
and overcomplicated.

(Moving these things into the signal_struct is not an option because
that is kept across executions, and pre-execve co-threads will share
the signal_struct that is also used by the task that has gone through
execve().)

I believe that this patch may be the least bad option to fix this - keep
the mm_struct (but not process memory) around with an mmgrab() reference
from exit_mm() until the task goes away completely.

Note that this moves free_task() down in order to make mmdrop_async()
available without a forward declaration.

Cc: stable@xxxxxxxxxxxxxxx
Fixes: bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks")
Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
---
 include/linux/sched.h |  8 ++++++++
 kernel/exit.c         |  2 ++
 kernel/fork.c         | 54 ++++++++++++++++++++++----------------------
 kernel/ptrace.c       | 10 ++++++++++
 4 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index afe01e232935..55bec6ff5626 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -747,6 +747,14 @@ struct task_struct {
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
+	/*
+	 * When we exit and ->mm (the reference pinning ->mm's address space)
+	 * goes away, we stash a reference to the mm_struct itself (counted via
+	 * exit_mm->mm_count) in this member.
+	 * This allows us to continue using the mm_struct for security checks
+	 * and such even after the task has started exiting.
+	 */
+	struct mm_struct		*exit_mm;
 
 	/* Per-thread vma caching: */
 	struct vmacache			vmacache;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 733e80f334e7..97253ef33486 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -476,6 +476,8 @@ static void exit_mm(void)
 	/* more a memory barrier than a real lock */
 	task_lock(current);
 	current->mm = NULL;
+	mmgrab(mm); /* for current->exit_mm */
+	current->exit_mm = mm;
 	mmap_read_unlock(mm);
 	enter_lazy_tlb(mm, current);
 	task_unlock(current);
diff --git a/kernel/fork.c b/kernel/fork.c
index da8d360fb032..4942428a217c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -438,32 +438,6 @@ void put_task_stack(struct task_struct *tsk)
 }
 #endif
 
-void free_task(struct task_struct *tsk)
-{
-	scs_release(tsk);
-
-#ifndef CONFIG_THREAD_INFO_IN_TASK
-	/*
-	 * The task is finally done with both the stack and thread_info,
-	 * so free both.
-	 */
-	release_task_stack(tsk);
-#else
-	/*
-	 * If the task had a separate stack allocation, it should be gone
-	 * by now.
-	 */
-	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
-#endif
-	rt_mutex_debug_task_free(tsk);
-	ftrace_graph_exit_task(tsk);
-	arch_release_task_struct(tsk);
-	if (tsk->flags & PF_KTHREAD)
-		free_kthread_struct(tsk);
-	free_task_struct(tsk);
-}
-EXPORT_SYMBOL(free_task);
-
 #ifdef CONFIG_MMU
 static __latent_entropy int dup_mmap(struct mm_struct *mm,
 					struct mm_struct *oldmm)
@@ -722,6 +696,34 @@ static inline void put_signal_struct(struct signal_struct *sig)
 		free_signal_struct(sig);
 }
 
+void free_task(struct task_struct *tsk)
+{
+	scs_release(tsk);
+
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+	/*
+	 * The task is finally done with both the stack and thread_info,
+	 * so free both.
+	 */
+	release_task_stack(tsk);
+#else
+	/*
+	 * If the task had a separate stack allocation, it should be gone
+	 * by now.
+	 */
+	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
+#endif
+	rt_mutex_debug_task_free(tsk);
+	ftrace_graph_exit_task(tsk);
+	arch_release_task_struct(tsk);
+	if (tsk->flags & PF_KTHREAD)
+		free_kthread_struct(tsk);
+	if (tsk->exit_mm)
+		mmdrop_async(tsk->exit_mm);
+	free_task_struct(tsk);
+}
+EXPORT_SYMBOL(free_task);
+
 void __put_task_struct(struct task_struct *tsk)
 {
 	WARN_ON(!tsk->exit_state);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 43d6179508d6..0aedc6cf5bdc 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -342,7 +342,17 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	 * Pairs with a write barrier in commit_creds().
 	 */
 	smp_rmb();
+	/*
+	 * Look up the target task's mm_struct. If it fails because the task is
+	 * exiting and has gone through exit_mm(), we can instead use ->exit_mm
+	 * as long as we only use members that are preserved by an mmgrab()
+	 * reference.
+	 * The only case in which both ->mm and ->exit_mm can be NULL should be
+	 * kernel threads.
+	 */
 	mm = task->mm;
+	if (!mm)
+		mm = task->exit_mm;
 	if (mm &&
 	    ((get_dumpable(mm) != SUID_DUMP_USER) &&
 	     !ptrace_has_cap(cred, mm->user_ns, mode)))
-- 
2.29.0.rc1.297.gfa9743e501-goog
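
For readers who don't spend much time in mm/: the reason stashing the
mm_struct is cheap is that it carries two separate reference counts.
mm_users pins the whole address space and is dropped by mmput(), which
exit_mm() has already done by this point; mm_count, taken by mmgrab() and
released by mmdrop()/mmdrop_async(), pins only the mm_struct itself -
which is exactly where the dumpability bits and mm->user_ns that
__ptrace_may_access() needs live. The standalone userspace program below
is a minimal sketch of that lifetime under the patch; the field and
helper names mirror the kernel's, but it is an illustration only (plain
ints instead of the kernel's atomic counters), not actual kernel code:

/*
 * Toy userspace model of the mm_struct refcount split this patch relies
 * on. Names mirror the kernel's; the code itself is only illustrative.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct mm_struct {
	int mm_users;	/* pins the address space (VMAs, page tables) */
	int mm_count;	/* pins only the struct (dumpability, user_ns, ...) */
	int dumpable;	/* metadata that __ptrace_may_access() wants */
};

/* mmdrop(): release a struct-only reference; free the struct at zero. */
static void mmdrop(struct mm_struct *mm)
{
	if (--mm->mm_count == 0) {
		printf("mm_struct freed\n");
		free(mm);
	}
}

/* mmput(): release an address-space reference; tear down memory at zero. */
static void mmput(struct mm_struct *mm)
{
	if (--mm->mm_users == 0) {
		printf("address space torn down, metadata still readable\n");
		mmdrop(mm);	/* the mm_users block held one mm_count ref */
	}
}

/* mmgrab(): pin the struct itself, not the memory behind it. */
static void mmgrab(struct mm_struct *mm)
{
	mm->mm_count++;
}

int main(void)
{
	struct mm_struct *mm = calloc(1, sizeof(*mm));

	if (!mm)
		return 1;
	mm->mm_users = 1;	/* execve() gave the task an address space */
	mm->mm_count = 1;	/* ...which holds one reference on the struct */

	/* exit_mm(): pin the struct, then let go of the memory. */
	mmgrab(mm);		/* for current->exit_mm */
	mmput(mm);		/* the address space is gone after this */

	/* __ptrace_may_access() can still look at the metadata. */
	assert(mm->dumpable == 0);

	/* free_task(): drop the exit_mm reference; now the struct goes away. */
	mmdrop(mm);
	return 0;
}

In the real kernel both counters are atomics and the final drop goes
through atomic_dec_and_test(); the sketch only models the ordering that
matters here: mmgrab() in exit_mm() before the address space is released,
and the matching mmdrop_async() from free_task() once the task_struct
itself is being torn down.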