On 5/28/21 7:20 PM, Vitaly Chikunov wrote: > Hi, > > Possible ptrace related bug in tools/testing/selftests/ptrace/vmaccess.c - > it timeout every time I run it on any kernel I try, in vm or bare metal. > > Does it run correctly for anybody? > No, that is an open issue, I wrote the test case in anticipation that all of my patch series got accepted, but the last patch was not picked up for inclusion in the linux kernel. Well, actually I still have the patch, see attached patch file. This makes the test case pass. The idea is that PTRACE_ATTACH returns EAGAIN in this case, and the tracer calls waitpid if that return value is received, which makes the zombie process die, and then PTRACE_ATTACH will succeed. Eric wanted to propose a different solution though. Eric, have you meanwhile been able to come up with a better solution? Thanks, Bernd. > Example vmaccess from v5.12 source tree run on 5.13.0+rc2 on bare metal > Supermicro server (AMD EPYC): > > 5.10.35-rt-alt1.rt39:~# /usr/lib/kselftests/ptrace/vmaccess > TAP version 13 > 1..2 > # Starting 2 tests from 1 test cases. > # RUN global.vmaccess ... > # OK global.vmaccess > ok 1 global.vmaccess > # RUN global.attach ... > # attach: Test terminated by timeout > # FAIL global.attach > not ok 2 global.attach > # FAILED: 1 / 2 tests passed. > # Totals: pass:1 fail:1 xfail:0 xpass:0 skip:0 error:0 > > Just to confirm for the latest kernel, it behaves the same [for drm-tip > based] on 5.13rc2. Also, just tested on 5.11.21 with the same failure. > > Other ptrace tests pass. > > Thanks, >
From c5afacba607c38f4c9d9bf106a071f9737ba1479 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger <bernd.edlinger@xxxxxxxxxx> Date: Tue, 10 Mar 2020 13:20:57 +0100 Subject: [PATCH 1/2] exec: Fix dead-lock in de_thread with ptrace_attach This removes the last users of cred_guard_mutex and replaces it with a new mutex exec_guard_mutex, and a boolean unsafe_execve_in_progress. This addresses the case when at least one of the sibling threads is traced, and therefore the trace process may dead-lock in ptrace_attach, but de_thread will need to wait for the tracer to continue execution. The solution is to detect this situation and make ptrace_attach and similar functions return -EAGAIN, but only in a situation where a dead-lock is imminent. This means this is an API change, but only when the process is traced while execve happens in a multi-threaded application. See tools/testing/selftests/ptrace/vmaccess.c for a test case that gets fixed by this change. Signed-off-by: Bernd Edlinger <bernd.edlinger@xxxxxxxxxx> --- fs/exec.c | 42 ++++++++++++++++++++++++++++++++++-------- fs/proc/base.c | 20 ++++++++++++++++++-- include/linux/sched/signal.h | 13 ++++++++++--- init/init_task.c | 2 +- kernel/cred.c | 2 +- kernel/fork.c | 2 +- kernel/ptrace.c | 44 +++++++++++++++++++++++++++++++++++++++++--- kernel/seccomp.c | 25 +++++++++++++++++++------ 8 files changed, 125 insertions(+), 25 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 18594f1..6ab93c0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1040,14 +1040,26 @@ static int de_thread(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; + struct task_struct *t = tsk; if (thread_group_empty(tsk)) goto no_thread_group; + spin_lock_irq(lock); + while_each_thread(tsk, t) { + if (unlikely(t->ptrace)) + sig->unsafe_execve_in_progress = true; + } + + if (unlikely(sig->unsafe_execve_in_progress)) { + spin_unlock_irq(lock); + mutex_unlock(&sig->exec_guard_mutex); + spin_lock_irq(lock); + } + /* * Kill all other threads in the thread group. */ - spin_lock_irq(lock); if (signal_group_exit(sig)) { /* * Another group action in progress, just @@ -1438,7 +1450,10 @@ void setup_new_exec(struct linux_binprm * bprm) */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); - mutex_unlock(&me->signal->cred_guard_mutex); + if (unlikely(me->signal->unsafe_execve_in_progress)) + mutex_lock(&me->signal->exec_guard_mutex); + current->signal->unsafe_execve_in_progress = false; + mutex_unlock(&me->signal->exec_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); @@ -1453,22 +1468,30 @@ void finalize_exec(struct linux_binprm *bprm) EXPORT_SYMBOL(finalize_exec); /* - * Prepare credentials and lock ->cred_guard_mutex. + * Prepare credentials and lock ->exec_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { - if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) + int ret; + + if (mutex_lock_interruptible(¤t->signal->exec_guard_mutex)) return -ERESTARTNOINTR; + ret = -EBUSY; + if (unlikely(current->signal->unsafe_execve_in_progress)) + goto out; + bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; - mutex_unlock(¤t->signal->cred_guard_mutex); - return -ENOMEM; + ret = -ENOMEM; +out: + mutex_unlock(¤t->signal->exec_guard_mutex); + return ret; } static void free_bprm(struct linux_binprm *bprm) @@ -1479,7 +1502,10 @@ static void free_bprm(struct linux_binprm *bprm) } free_arg_pages(bprm); if (bprm->cred) { - mutex_unlock(¤t->signal->cred_guard_mutex); + if (unlikely(current->signal->unsafe_execve_in_progress)) + mutex_lock(¤t->signal->exec_guard_mutex); + current->signal->unsafe_execve_in_progress = false; + mutex_unlock(¤t->signal->exec_guard_mutex); abort_creds(bprm->cred); } if (bprm->file) { @@ -1542,7 +1568,7 @@ int bprm_change_interp(const char *interp, struct linux_binprm *bprm) /* * determine how safe it is to execute the proposed program - * - the caller must hold ->cred_guard_mutex to protect against + * - the caller must hold ->exec_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) diff --git a/fs/proc/base.c b/fs/proc/base.c index 3851bfc..fe2833c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2735,14 +2735,30 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, } /* Guard against adverse ptrace interaction */ - rv = mutex_lock_interruptible(¤t->signal->cred_guard_mutex); + rv = mutex_lock_interruptible(¤t->signal->exec_guard_mutex); if (rv < 0) goto out_free; + /* + * BIG FAT WARNING - Fragile code ahead. + * Please do not insert any code between these two + * if statements. It may happen that execve has to + * release the exec_guard_mutex in order to prevent + * deadlocks. In that case unsafe_execve_in_progress + * will be set. If that happens you cannot assume that + * the usual guarantees implied by exec_guard_mutex + * are valid. Just return -EBUSY in that case and + * unlock the mutex immediately. + */ + rv = -EBUSY; + if (unlikely(current->signal->unsafe_execve_in_progress)) + goto out_unlock; + rv = security_setprocattr(PROC_I(inode)->op.lsm, file->f_path.dentry->d_name.name, page, count); - mutex_unlock(¤t->signal->cred_guard_mutex); +out_unlock: + mutex_unlock(¤t->signal->exec_guard_mutex); out_free: kfree(page); out: diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3f6a0fc..407545a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -214,6 +214,13 @@ struct signal_struct { #endif /* + * Set while execve is executing but is *not* holding + * exec_guard_mutex to avoid possible dead-locks. + * Only valid when exec_guard_mutex is held. + */ + bool unsafe_execve_in_progress; + + /* * Thread is the potential origin of an oom condition; kill first on * oom */ @@ -224,11 +231,11 @@ struct signal_struct { struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */ - struct mutex cred_guard_mutex; /* guard against foreign influences on + struct mutex exec_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) - * Deprecated do not use in new code. - * Use exec_update_lock instead. + * Held while execve runs, except when + * a sibling thread is being traced. */ struct rw_semaphore exec_update_lock; /* Held while task_struct is * being updated during exec, diff --git a/init/init_task.c b/init/init_task.c index 3711cda..b917349 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -25,7 +25,7 @@ }, .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, - .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), + .exec_guard_mutex = __MUTEX_INITIALIZER(init_signals.exec_guard_mutex), .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), diff --git a/kernel/cred.c b/kernel/cred.c index 421b114..f408024 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -295,7 +295,7 @@ struct cred *prepare_creds(void) /* * Prepare credentials for current to perform an execve() - * - The caller must hold ->cred_guard_mutex + * - The caller must hold ->exec_guard_mutex */ struct cred *prepare_exec_creds(void) { diff --git a/kernel/fork.c b/kernel/fork.c index 426cd0c..5602226 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1601,7 +1601,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; - mutex_init(&sig->cred_guard_mutex); + mutex_init(&sig->exec_guard_mutex); init_rwsem(&sig->exec_update_lock); return 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 61db50f..e052586 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -386,9 +386,26 @@ static int ptrace_attach(struct task_struct *task, long request, * under ptrace. */ retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) + if (mutex_lock_interruptible(&task->signal->exec_guard_mutex)) goto out; + /* + * BIG FAT WARNING - Fragile code ahead. + * Please do not insert any code between these two + * if statements. It may happen that execve has to + * release the exec_guard_mutex in order to prevent + * deadlocks. In that case unsafe_execve_in_progress + * will be set. If that happens you cannot assume that + * the usual guarantees implied by exec_guard_mutex + * are valid. Just return -EAGAIN in that case and + * unlock the mutex immediately. + * The tracer is expected to call wait(2) and handle + * possible events before calling this API again. + */ + retval = -EAGAIN; + if (unlikely(task->signal->unsafe_execve_in_progress)) + goto unlock_creds; + task_lock(task); retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); @@ -441,7 +458,7 @@ static int ptrace_attach(struct task_struct *task, long request, unlock_tasklist: write_unlock_irq(&tasklist_lock); unlock_creds: - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_guard_mutex); out: if (!retval) { /* @@ -466,10 +483,29 @@ static int ptrace_attach(struct task_struct *task, long request, */ static int ptrace_traceme(void) { - int ret = -EPERM; + int ret; + + if (mutex_lock_interruptible(¤t->signal->exec_guard_mutex)) + return -ERESTARTNOINTR; + + /* + * BIG FAT WARNING - Fragile code ahead. + * Please do not insert any code between these two + * if statements. It may happen that execve has to + * release the exec_guard_mutex in order to prevent + * deadlocks. In that case unsafe_execve_in_progress + * will be set. If that happens you cannot assume that + * the usual guarantees implied by exec_guard_mutex + * are valid. Just return -EAGAIN in that case and + * unlock the mutex immediately. + */ + ret = -EAGAIN; + if (unlikely(current->signal->unsafe_execve_in_progress)) + goto unlock_creds; write_lock_irq(&tasklist_lock); /* Are we already being traced? */ + ret = -EPERM; if (!current->ptrace) { ret = security_ptrace_traceme(current->parent); /* @@ -484,6 +520,8 @@ static int ptrace_traceme(void) } write_unlock_irq(&tasklist_lock); +unlock_creds: + mutex_unlock(¤t->signal->exec_guard_mutex); return ret; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1d60fc2..9292b25 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -466,7 +466,7 @@ static int is_ancestor(struct seccomp_filter *parent, /** * seccomp_can_sync_threads: checks if all threads can be synchronized * - * Expects sighand and cred_guard_mutex locks to be held. + * Expects sighand and exec_guard_mutex locks to be held. * * Returns 0 on success, -ve on error, or the pid of a thread which was * either not in the correct seccomp mode or did not have an ancestral @@ -476,9 +476,22 @@ static inline pid_t seccomp_can_sync_threads(void) { struct task_struct *thread, *caller; - BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); + BUG_ON(!mutex_is_locked(¤t->signal->exec_guard_mutex)); assert_spin_locked(¤t->sighand->siglock); + /* + * BIG FAT WARNING - Fragile code ahead. + * It may happen that execve has to release the + * exec_guard_mutex in order to prevent deadlocks. + * In that case unsafe_execve_in_progress will be set. + * If that happens you cannot assume that the usual + * guarantees implied by exec_guard_mutex are valid. + * Just return -EAGAIN in that case and unlock the mutex + * immediately. + */ + if (unlikely(current->signal->unsafe_execve_in_progress)) + return -EAGAIN; + /* Validate all threads being eligible for synchronization. */ caller = current; for_each_thread(caller, thread) { @@ -564,7 +577,7 @@ void seccomp_filter_release(struct task_struct *tsk) /** * seccomp_sync_threads: sets all threads to use current's filter * - * Expects sighand and cred_guard_mutex locks to be held, and for + * Expects sighand and exec_guard_mutex locks to be held, and for * seccomp_can_sync_threads() to have returned success already * without dropping the locks. * @@ -573,7 +586,7 @@ static inline void seccomp_sync_threads(unsigned long flags) { struct task_struct *thread, *caller; - BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); + BUG_ON(!mutex_is_locked(¤t->signal->exec_guard_mutex)); assert_spin_locked(¤t->sighand->siglock); /* Synchronize all threads. */ @@ -1825,7 +1838,7 @@ static long seccomp_set_mode_filter(unsigned int flags, * while another thread is in the middle of calling exec. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC && - mutex_lock_killable(¤t->signal->cred_guard_mutex)) + mutex_lock_killable(¤t->signal->exec_guard_mutex)) goto out_put_fd; spin_lock_irq(¤t->sighand->siglock); @@ -1848,7 +1861,7 @@ static long seccomp_set_mode_filter(unsigned int flags, out: spin_unlock_irq(¤t->sighand->siglock); if (flags & SECCOMP_FILTER_FLAG_TSYNC) - mutex_unlock(¤t->signal->cred_guard_mutex); + mutex_unlock(¤t->signal->exec_guard_mutex); out_put_fd: if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { if (ret) { -- 1.9.1