On 14.03.2020 12:11, Bernd Edlinger wrote: > The cred_guard_mutex is problematic. The cred_guard_mutex is held > over the userspace accesses as the arguments from userspace are read. > The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other > threads are killed. The cred_guard_mutex is held over > "put_user(0, tsk->clear_child_tid)" in exit_mm(). > > Any of those can result in deadlock, as the cred_guard_mutex is held > over a possible indefinite userspace waits for userspace. > > Add exec_update_mutex that is only held over exec updating process > with the new contents of exec, so that code that needs not to be > confused by exec changing the mm and the cred in ways that can not > happen during ordinary execution of a process. > > The plan is to switch the users of cred_guard_mutex to > exec_udpate_mutex one by one. This lets us move forward while still > being careful and not introducing any regressions. > > Link: https://lore.kernel.org/lkml/20160921152946.GA24210@xxxxxxxxxxxxxx/ > Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/ > Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@xxxxxxxxxx/ > Link: https://lore.kernel.org/lkml/20160923095031.GA14923@xxxxxxxxxx/ > Link: https://lore.kernel.org/lkml/20170213141452.GA30203@xxxxxxxxxx/ > Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.") > Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2") > Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> > Signed-off-by: Bernd Edlinger <bernd.edlinger@xxxxxxxxxx> > --- > fs/exec.c | 17 ++++++++++++++--- > include/linux/binfmts.h | 8 +++++++- > include/linux/sched/signal.h | 9 ++++++++- > init/init_task.c | 1 + > kernel/fork.c | 1 + > 5 files changed, 31 insertions(+), 5 deletions(-) > > v3: this update fixes lock-order and adds an explicit data member in linux_binprm > > diff --git a/fs/exec.c b/fs/exec.c > index d820a72..11974a1 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -1014,12 +1014,17 @@ static int exec_mmap(struct mm_struct *mm) > { > struct task_struct *tsk; > struct mm_struct *old_mm, *active_mm; > + int ret; > > /* Notify parent that we're no longer interested in the old VM */ > tsk = current; > old_mm = current->mm; > exec_mm_release(tsk, old_mm); > > + ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); > + if (ret) > + return ret; > + > if (old_mm) { > sync_mm_rss(old_mm); > /* > @@ -1031,9 +1036,11 @@ static int exec_mmap(struct mm_struct *mm) > down_read(&old_mm->mmap_sem); > if (unlikely(old_mm->core_state)) { > up_read(&old_mm->mmap_sem); > + mutex_unlock(&tsk->signal->exec_update_mutex); > return -EINTR; > } > } > + > task_lock(tsk); > active_mm = tsk->active_mm; > membarrier_exec_mmap(mm); > @@ -1288,11 +1295,12 @@ int flush_old_exec(struct linux_binprm * bprm) > goto out; > > /* > - * After clearing bprm->mm (to mark that current is using the > - * prepared mm now), we have nothing left of the original > + * After setting bprm->called_exec_mmap (to mark that current is > + * using the prepared mm now), we have nothing left of the original > * process. If anything from here on returns an error, the check > * in search_binary_handler() will SEGV current. > */ > + bprm->called_exec_mmap = 1; The two below is non-breaking pair: exec_mmap(bprm->mm); bprm->called_exec_mmap = 1; Why not move this into exec_mmap(), so nobody definitely inserts something between them? > bprm->mm = NULL; > > #ifdef CONFIG_POSIX_TIMERS > @@ -1438,6 +1446,8 @@ static void free_bprm(struct linux_binprm *bprm) > { > free_arg_pages(bprm); > if (bprm->cred) { > + if (bprm->called_exec_mmap) > + mutex_unlock(¤t->signal->exec_update_mutex); > mutex_unlock(¤t->signal->cred_guard_mutex); > abort_creds(bprm->cred); > } > @@ -1487,6 +1497,7 @@ void install_exec_creds(struct linux_binprm *bprm) > * credentials; any time after this it may be unlocked. > */ > security_bprm_committed_creds(bprm); > + mutex_unlock(¤t->signal->exec_update_mutex); > mutex_unlock(¤t->signal->cred_guard_mutex); > } > EXPORT_SYMBOL(install_exec_creds); > @@ -1678,7 +1689,7 @@ int search_binary_handler(struct linux_binprm *bprm) > > read_lock(&binfmt_lock); > put_binfmt(fmt); > - if (retval < 0 && !bprm->mm) { > + if (retval < 0 && bprm->called_exec_mmap) { > /* we got to flush_old_exec() and failed after it */ > read_unlock(&binfmt_lock); > force_sigsegv(SIGSEGV); > diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h > index b40fc63..a345d9f 100644 > --- a/include/linux/binfmts.h > +++ b/include/linux/binfmts.h > @@ -44,7 +44,13 @@ struct linux_binprm { > * exec has happened. Used to sanitize execution environment > * and to set AT_SECURE auxv for glibc. > */ > - secureexec:1; > + secureexec:1, > + /* > + * Set by flush_old_exec, when exec_mmap has been called. > + * This is past the point of no return, when the > + * exec_update_mutex has been taken. > + */ > + called_exec_mmap:1; > #ifdef __alpha__ > unsigned int taso:1; > #endif > diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h > index 8805025..a29df79 100644 > --- a/include/linux/sched/signal.h > +++ b/include/linux/sched/signal.h > @@ -224,7 +224,14 @@ struct signal_struct { > > struct mutex cred_guard_mutex; /* guard against foreign influences on > * credential calculations > - * (notably. ptrace) */ > + * (notably. ptrace) > + * Deprecated do not use in new code. > + * Use exec_update_mutex instead. > + */ > + struct mutex exec_update_mutex; /* Held while task_struct is being > + * updated during exec, and may have > + * inconsistent permissions. > + */ > } __randomize_layout; > > /* > diff --git a/init/init_task.c b/init/init_task.c > index 9e5cbe5..bd403ed 100644 > --- a/init/init_task.c > +++ b/init/init_task.c > @@ -26,6 +26,7 @@ > .multiprocess = HLIST_HEAD_INIT, > .rlim = INIT_RLIMITS, > .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), > + .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex), > #ifdef CONFIG_POSIX_TIMERS > .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), > .cputimer = { > diff --git a/kernel/fork.c b/kernel/fork.c > index 8642530..036b692 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) > sig->oom_score_adj_min = current->signal->oom_score_adj_min; > > mutex_init(&sig->cred_guard_mutex); > + mutex_init(&sig->exec_update_mutex); > > return 0; > } >