Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> writes: > On Mon, Jun 21, 2021 at 1:04 PM Eric W. Biederman <ebiederm@xxxxxxxxxxxx> wrote: >> >> For other ptrace_event calls I am playing with seeing if I can split >> them in two. Like sending a signal. So that we can have perform all >> of the work in get_signal. > > That sounds like the right model, but I don't think it works. > Particularly not for exit(). The second phase will never happen. Playing with it some more I think I have everything working working except for PTRACE_EVENT_SECCOMP (which can stay ptrace_event) and group_exit(2). Basically in exit sending yourself a signal and then calling do_exit from the signal handler is not unreasonable, as exit is an ordinary system call. I haven't seen anything that ``knows'' that exit(2) or exit_group(2) will never return and adds a special case in the system call table for that case. The complications of exit_group(2) are roughly those of moving ptrace_event out of do_exit. They look doable and I am going to look at that next. This is not to say that this is the most maintainable way or that we necessarily want to implement things this way, but I need to look and see what it looks like. For purposes of discussion this is my current draft implementation. diff --git a/include/linux/sched.h b/include/linux/sched.h index d2c881384517..891812d32b90 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1087,6 +1087,7 @@ struct task_struct { struct capture_control *capture_control; #endif /* Ptrace state: */ + int stop_code; unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index b5ebf6c01292..33c50119b193 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -164,18 +164,29 @@ static inline void ptrace_event(int event, unsigned long message) } } +static inline bool ptrace_post_event(int event, unsigned long message) +{ + bool posted = false; + if (unlikely(ptrace_event_enabled(current, event))) { + current->ptrace_message = message; + current->stop_code = (event << 8) | SIGTRAP; + set_tsk_thread_flag(current, TIF_SIGPENDING); + posted = true; + } else if (event == PTRACE_EVENT_EXEC) { + /* legacy EXEC report via SIGTRAP */ + if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED) + send_sig(SIGTRAP, current, 0); + } + return posted; +} + /** - * ptrace_event_pid - possibly stop for a ptrace event notification - * @event: %PTRACE_EVENT_* value to report - * @pid: process identifier for %PTRACE_GETEVENTMSG to return - * - * Check whether @event is enabled and, if so, report @event and @pid - * to the ptrace parent. @pid is reported as the pid_t seen from the - * ptrace parent's pid namespace. + * pid_parent_nr - Return the number the parent knows this pid as + * @pid: The struct pid whose numerical value we want * * Called without locks. */ -static inline void ptrace_event_pid(int event, struct pid *pid) +static inline pid_t pid_parent_nr(struct pid *pid) { /* * FIXME: There's a potential race if a ptracer in a different pid @@ -183,16 +194,15 @@ static inline void ptrace_event_pid(int event, struct pid *pid) * when we acquire tasklist_lock in ptrace_stop(). If this happens, * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. */ - unsigned long message = 0; + pid_t nr = 0; struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(rcu_dereference(current->parent)); if (ns) - message = pid_nr_ns(pid, ns); + nr = pid_nr_ns(pid, ns); rcu_read_unlock(); - - ptrace_event(event, message); + return nr; } /** diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index e24b1fe348e3..a2eac3831369 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -97,6 +97,8 @@ extern void exit_mm_release(struct task_struct *, struct mm_struct *); /* Remove the current tasks stale references to the old mm_struct on exec() */ extern void exec_mm_release(struct task_struct *, struct mm_struct *); +extern int wait_for_vfork_done(struct task_struct *child, struct completion *vfork); + #ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); #else diff --git a/fs/exec.c b/fs/exec.c index 18594f11c31f..bb4751d84e2d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1781,7 +1781,7 @@ static int exec_binprm(struct linux_binprm *bprm) audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); - ptrace_event(PTRACE_EVENT_EXEC, old_vpid); + ptrace_post_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index fd1c04193e18..aeb22a8e4d24 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -889,7 +889,9 @@ EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) { - do_exit((error_code&0xff)<<8); + long code = (error_code&0xff)<<8; + if (!ptrace_post_event(PTRACE_EVENT_EXIT, code)) + do_exit((error_code&0xff)<<8); } /* diff --git a/kernel/fork.c b/kernel/fork.c index dc06afd725cb..8533e056a3d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1266,8 +1266,7 @@ static void complete_vfork_done(struct task_struct *tsk) task_unlock(tsk); } -static int wait_for_vfork_done(struct task_struct *child, - struct completion *vfork) +int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { int killed; @@ -2278,7 +2277,8 @@ static __latent_entropy struct task_struct *copy_process( init_task_pid_links(p); if (likely(p->pid)) { - ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); + ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || + (trace && ptrace_event_enabled(current, trace))); init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { @@ -2462,7 +2462,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) pid_t kernel_clone(struct kernel_clone_args *args) { u64 clone_flags = args->flags; - struct completion vfork; + unsigned long message; struct pid *pid; struct task_struct *p; int trace = 0; @@ -2495,9 +2495,6 @@ pid_t kernel_clone(struct kernel_clone_args *args) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; - - if (likely(!ptrace_event_enabled(current, trace))) - trace = 0; } p = copy_process(NULL, trace, NUMA_NO_NODE, args); @@ -2512,30 +2509,27 @@ pid_t kernel_clone(struct kernel_clone_args *args) */ trace_sched_process_fork(current, p); - pid = get_task_pid(p, PIDTYPE_PID); + pid = task_pid(p); nr = pid_vnr(pid); + message = pid_parent_nr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, args->parent_tid); - if (clone_flags & CLONE_VFORK) { - p->vfork_done = &vfork; + if (!(clone_flags & CLONE_VFORK)) { + wake_up_new_task(p); + ptrace_post_event(trace, message); + } + else if (!ptrace_post_event(PTRACE_EVENT_VFORK, (unsigned long)p)) { + struct completion vfork; init_completion(&vfork); + p->vfork_done = &vfork; get_task_struct(p); + wake_up_new_task(p); + if (wait_for_vfork_done(p, &vfork)) + ptrace_post_event(PTRACE_EVENT_VFORK_DONE, message); } - wake_up_new_task(p); - - /* forking complete and child started to run, tell ptracer */ - if (unlikely(trace)) - ptrace_event_pid(trace, pid); - - if (clone_flags & CLONE_VFORK) { - if (!wait_for_vfork_done(p, &vfork)) - ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); - } - - put_pid(pid); return nr; } diff --git a/kernel/signal.c b/kernel/signal.c index f7c6ffcbd044..8ac8c4a31d88 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -155,7 +155,8 @@ static bool recalc_sigpending_tsk(struct task_struct *t) if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked) || - cgroup_task_frozen(t)) { + cgroup_task_frozen(t) || + t->stop_code) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } @@ -2607,6 +2608,39 @@ bool get_signal(struct ksignal *ksig) if (unlikely(current->task_works)) task_work_run(); +ptrace_event: + /* Handle a posted ptrace event */ + if (unlikely(current->stop_code)) { + int stop_code = current->stop_code; + unsigned long message = current->ptrace_message; + struct completion vfork; + struct task_struct *p; + + current->stop_code = 0; + + if (stop_code == PTRACE_EVENT_VFORK) { + p = (struct task_struct *)message; + get_task_struct(p); + current->ptrace_message = pid_parent_nr(task_pid(p)); + init_completion(&vfork); + p->vfork_done = &vfork; + wake_up_new_task(p); + } + + spin_lock_irq(&sighand->siglock); + ptrace_do_notify(SIGTRAP, stop_code, CLD_TRAPPED); + spin_unlock_irq(&sighand->siglock); + + if ((stop_code == PTRACE_EVENT_VFORK) && + wait_for_vfork_done(p, &vfork) && + ptrace_post_event(PTRACE_EVENT_VFORK_DONE, message)) + goto ptrace_event; + + if (stop_code == PTRACE_EVENT_EXIT) { + do_exit(message); + } + } + /* * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so * that the arch handlers don't all have to do it. If we get here Eric