On 3/17/20 9:56 AM, Kirill Tkhai wrote: > On 14.03.2020 12:11, Bernd Edlinger wrote: >> The cred_guard_mutex is problematic. The cred_guard_mutex is held >> over the userspace accesses as the arguments from userspace are read. >> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other >> threads are killed. The cred_guard_mutex is held over >> "put_user(0, tsk->clear_child_tid)" in exit_mm(). >> >> Any of those can result in deadlock, as the cred_guard_mutex is held >> over a possible indefinite userspace waits for userspace. >> >> Add exec_update_mutex that is only held over exec updating process >> with the new contents of exec, so that code that needs not to be >> confused by exec changing the mm and the cred in ways that can not >> happen during ordinary execution of a process. >> >> The plan is to switch the users of cred_guard_mutex to >> exec_udpate_mutex one by one. This lets us move forward while still >> being careful and not introducing any regressions. >> >> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@xxxxxxxxxxxxxx/ >> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/ >> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@xxxxxxxxxx/ >> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@xxxxxxxxxx/ >> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@xxxxxxxxxx/ >> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.") >> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2") >> Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> >> Signed-off-by: Bernd Edlinger <bernd.edlinger@xxxxxxxxxx> >> --- >> fs/exec.c | 17 ++++++++++++++--- >> include/linux/binfmts.h | 8 +++++++- >> include/linux/sched/signal.h | 9 ++++++++- >> init/init_task.c | 1 + >> kernel/fork.c | 1 + >> 5 files changed, 31 insertions(+), 5 deletions(-) >> >> v3: this update fixes lock-order and adds an explicit data member in linux_binprm >> >> diff --git a/fs/exec.c b/fs/exec.c >> index d820a72..11974a1 100644 >> --- a/fs/exec.c >> +++ b/fs/exec.c >> @@ -1014,12 +1014,17 @@ static int exec_mmap(struct mm_struct *mm) >> { >> struct task_struct *tsk; >> struct mm_struct *old_mm, *active_mm; >> + int ret; >> >> /* Notify parent that we're no longer interested in the old VM */ >> tsk = current; >> old_mm = current->mm; >> exec_mm_release(tsk, old_mm); >> >> + ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); >> + if (ret) >> + return ret; >> + >> if (old_mm) { >> sync_mm_rss(old_mm); >> /* >> @@ -1031,9 +1036,11 @@ static int exec_mmap(struct mm_struct *mm) >> down_read(&old_mm->mmap_sem); >> if (unlikely(old_mm->core_state)) { >> up_read(&old_mm->mmap_sem); >> + mutex_unlock(&tsk->signal->exec_update_mutex); >> return -EINTR; >> } >> } >> + >> task_lock(tsk); >> active_mm = tsk->active_mm; >> membarrier_exec_mmap(mm); >> @@ -1288,11 +1295,12 @@ int flush_old_exec(struct linux_binprm * bprm) >> goto out; >> >> /* >> - * After clearing bprm->mm (to mark that current is using the >> - * prepared mm now), we have nothing left of the original >> + * After setting bprm->called_exec_mmap (to mark that current is >> + * using the prepared mm now), we have nothing left of the original >> * process. If anything from here on returns an error, the check >> * in search_binary_handler() will SEGV current. >> */ >> + bprm->called_exec_mmap = 1; > > The two below is non-breaking pair: > > exec_mmap(bprm->mm); > bprm->called_exec_mmap = 1; > > Why not move this into exec_mmap(), so nobody definitely inserts something > between them? > Hmm, could be done, but then I would probably need a different name than "called_exec_mmap". How about adding a nice function comment to exec_mmap that calls out the changed behaviour that the exec_update_mutex is taken unless the function fails? Bernd. >> bprm->mm = NULL; >> >> #ifdef CONFIG_POSIX_TIMERS >> @@ -1438,6 +1446,8 @@ static void free_bprm(struct linux_binprm *bprm) >> { >> free_arg_pages(bprm); >> if (bprm->cred) { >> + if (bprm->called_exec_mmap) >> + mutex_unlock(¤t->signal->exec_update_mutex); >> mutex_unlock(¤t->signal->cred_guard_mutex); >> abort_creds(bprm->cred); >> } >> @@ -1487,6 +1497,7 @@ void install_exec_creds(struct linux_binprm *bprm) >> * credentials; any time after this it may be unlocked. >> */ >> security_bprm_committed_creds(bprm); >> + mutex_unlock(¤t->signal->exec_update_mutex); >> mutex_unlock(¤t->signal->cred_guard_mutex); >> } >> EXPORT_SYMBOL(install_exec_creds); >> @@ -1678,7 +1689,7 @@ int search_binary_handler(struct linux_binprm *bprm) >> >> read_lock(&binfmt_lock); >> put_binfmt(fmt); >> - if (retval < 0 && !bprm->mm) { >> + if (retval < 0 && bprm->called_exec_mmap) { >> /* we got to flush_old_exec() and failed after it */ >> read_unlock(&binfmt_lock); >> force_sigsegv(SIGSEGV); >> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h >> index b40fc63..a345d9f 100644 >> --- a/include/linux/binfmts.h >> +++ b/include/linux/binfmts.h >> @@ -44,7 +44,13 @@ struct linux_binprm { >> * exec has happened. Used to sanitize execution environment >> * and to set AT_SECURE auxv for glibc. >> */ >> - secureexec:1; >> + secureexec:1, >> + /* >> + * Set by flush_old_exec, when exec_mmap has been called. >> + * This is past the point of no return, when the >> + * exec_update_mutex has been taken. >> + */ >> + called_exec_mmap:1; >> #ifdef __alpha__ >> unsigned int taso:1; >> #endif >> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h >> index 8805025..a29df79 100644 >> --- a/include/linux/sched/signal.h >> +++ b/include/linux/sched/signal.h >> @@ -224,7 +224,14 @@ struct signal_struct { >> >> struct mutex cred_guard_mutex; /* guard against foreign influences on >> * credential calculations >> - * (notably. ptrace) */ >> + * (notably. ptrace) >> + * Deprecated do not use in new code. >> + * Use exec_update_mutex instead. >> + */ >> + struct mutex exec_update_mutex; /* Held while task_struct is being >> + * updated during exec, and may have >> + * inconsistent permissions. >> + */ >> } __randomize_layout; >> >> /* >> diff --git a/init/init_task.c b/init/init_task.c >> index 9e5cbe5..bd403ed 100644 >> --- a/init/init_task.c >> +++ b/init/init_task.c >> @@ -26,6 +26,7 @@ >> .multiprocess = HLIST_HEAD_INIT, >> .rlim = INIT_RLIMITS, >> .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), >> + .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex), >> #ifdef CONFIG_POSIX_TIMERS >> .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), >> .cputimer = { >> diff --git a/kernel/fork.c b/kernel/fork.c >> index 8642530..036b692 100644 >> --- a/kernel/fork.c >> +++ b/kernel/fork.c >> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) >> sig->oom_score_adj_min = current->signal->oom_score_adj_min; >> >> mutex_init(&sig->cred_guard_mutex); >> + mutex_init(&sig->exec_update_mutex); >> >> return 0; >> } >> >