For simplicity and consistency, this patch provides an implementation for
signal-based fault notification prior to the coredump of a child process.
A new prctl command, PR_SET_PREDUMP_SIG, is defined that can be used by an
application to express its interest and to specify the signal (SIGCHLD or
SIGUSR1 or SIGUSR2) for such a notification. A new signal code (si_code),
CLD_PREDUMP, is also defined for SIGCHLD.

Background:

As the coredump of a process may take time, in certain time-sensitive
applications it is necessary for a parent process (e.g., a process manager)
to be notified of a child's imminent death before the coredump so that the
parent process can act sooner, such as re-spawning an application process,
or initiating a control-plane fail-over.

Currently there are two ways for a parent process to be notified of a child
process's state change. One is to use the POSIX signal, and the other is to
use the kernel connector module. The specific events and actions are
summarized as follows:

Process Event    POSIX Signal                Connector-based
----------------------------------------------------------------------
ptrace_attach()  do_notify_parent_cldstop()  proc_ptrace_connector()
                 SIGCHLD / CLD_STOPPED

ptrace_detach()  do_notify_parent_cldstop()  proc_ptrace_connector()
                 SIGCHLD / CLD_CONTINUED

pre_coredump/    N/A                         proc_coredump_connector()
get_signal()

post_coredump/   do_notify_parent()          proc_exit_connector()
do_exit()        SIGCHLD / exit_signal
----------------------------------------------------------------------

As shown in the table, the signal-based pre-coredump notification is not
currently available. In some cases using a connector-based notification can
be quite complicated (e.g., when a process manager is written in shell
scripts and thus is subject to certain inherent limitations), and a
signal-based notification would be simpler and better suited.
Signed-off-by: Enke Chen <enkechen@xxxxxxxxx> --- arch/x86/kernel/signal_compat.c | 2 +- include/linux/sched.h | 4 ++ include/linux/signal.h | 5 +++ include/uapi/asm-generic/siginfo.h | 3 +- include/uapi/linux/prctl.h | 4 ++ kernel/fork.c | 1 + kernel/signal.c | 51 +++++++++++++++++++++++++ kernel/sys.c | 77 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 145 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 9ccbf05..a3deba8 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -30,7 +30,7 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGSEGV != 7); BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 5); - BUILD_BUG_ON(NSIGCHLD != 6); + BUILD_BUG_ON(NSIGCHLD != 7); BUILD_BUG_ON(NSIGSYS != 1); /* This is part of the ABI and can never change in size: */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 09026ea..cfb9645 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -696,6 +696,10 @@ struct task_struct { int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; + + /* The signal sent prior to a child's coredump: */ + int predump_signal; + /* JOBCTL_*, siglock protected: */ unsigned long jobctl; diff --git a/include/linux/signal.h b/include/linux/signal.h index 706a499..7cb976d 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -256,6 +256,11 @@ static inline int valid_signal(unsigned long sig) return sig <= _NSIG ? 
1 : 0; } +static inline int valid_predump_signal(int sig) +{ + return (sig == SIGCHLD) || (sig == SIGUSR1) || (sig == SIGUSR2); +} + struct timespec; struct pt_regs; enum pid_type; diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index cb3d6c2..1a47cef 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -267,7 +267,8 @@ struct { \ #define CLD_TRAPPED 4 /* traced child has trapped */ #define CLD_STOPPED 5 /* child has stopped */ #define CLD_CONTINUED 6 /* stopped child has continued */ -#define NSIGCHLD 6 +#define CLD_PREDUMP 7 /* child is about to dump core */ +#define NSIGCHLD 7 /* * SIGPOLL (or any other signal without signal specific si_codes) si_codes diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index c0d7ea0..79f0a8a 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -219,4 +219,8 @@ struct prctl_mm_map { # define PR_SPEC_DISABLE (1UL << 2) # define PR_SPEC_FORCE_DISABLE (1UL << 3) +/* Whether to receive signal prior to child's coredump */ +#define PR_SET_PREDUMP_SIG 54 +#define PR_GET_PREDUMP_SIG 55 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 07cddff..c296c11 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1985,6 +1985,7 @@ static __latent_entropy struct task_struct *copy_process( p->dirty_paused_when = 0; p->pdeath_signal = 0; + p->predump_signal = 0; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; diff --git a/kernel/signal.c b/kernel/signal.c index 312b43e..eb4a483 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2337,6 +2337,44 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info) return signr; } +/* + * Let the parent, if so desired, know about the imminent death of a child + * prior to its coredump. + * + * Locking logic is similar to do_notify_parent_cldstop(). 
+ */ +static void do_notify_parent_predump(struct task_struct *tsk) +{ + struct sighand_struct *sighand; + struct task_struct *parent; + struct kernel_siginfo info; + unsigned long flags; + int sig; + + parent = tsk->real_parent; + sig = parent->predump_signal; + + /* Check again with "tasklist_lock" locked by the caller */ + if (!valid_predump_signal(sig)) + return; + + clear_siginfo(&info); + info.si_signo = sig; + if (sig == SIGCHLD) + info.si_code = CLD_PREDUMP; + + rcu_read_lock(); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); + info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), + task_uid(tsk)); + rcu_read_unlock(); + + sighand = parent->sighand; + spin_lock_irqsave(&sighand->siglock, flags); + __group_send_sig_info(sig, &info, parent); + spin_unlock_irqrestore(&sighand->siglock, flags); +} + bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; @@ -2497,6 +2535,19 @@ bool get_signal(struct ksignal *ksig) current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { + /* + * Notify the parent prior to the coredump if the + * parent is interested in such a notification. 
+ */ + int p_sig = current->real_parent->predump_signal; + + if (valid_predump_signal(p_sig)) { + read_lock(&tasklist_lock); + do_notify_parent_predump(current); + read_unlock(&tasklist_lock); + cond_resched(); + } + if (print_fatal_signals) print_fatal_signal(ksig->info.si_signo); proc_coredump_connector(current); diff --git a/kernel/sys.c b/kernel/sys.c index 123bd73..43eb250d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2258,6 +2258,76 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } +static int prctl_get_predump_signal(struct task_struct *tsk, pid_t pid, + int __user *addr) +{ + struct task_struct *p; + int error; + + /* For the current task, the common case */ + if (pid == 0) { + put_user(tsk->predump_signal, addr); + return 0; + } + + error = -ESRCH; + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (p) { + error = 0; + put_user(p->predump_signal, addr); + } + rcu_read_unlock(); + return error; +} + +/* + * Returns true if current's euid is same as p's uid or euid, + * or has CAP_SYS_ADMIN. + * + * Called with rcu_read_lock, creds are safe. + * + * Adapted from set_one_prio_perm(). 
+ */ +static bool set_predump_signal_perm(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + + return uid_eq(pcred->uid, cred->euid) || + uid_eq(pcred->euid, cred->euid) || + capable(CAP_SYS_ADMIN); +} + +static int prctl_set_predump_signal(struct task_struct *tsk, pid_t pid, int sig) +{ + struct task_struct *p; + int error; + + /* 0 is valid for disabling the feature */ + if (sig && !valid_predump_signal(sig)) + return -EINVAL; + + /* For the current task, the common case */ + if (pid == 0) { + tsk->predump_signal = sig; + return 0; + } + + error = -ESRCH; + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (p) { + if (!set_predump_signal_perm(p)) + error = -EPERM; + else { + error = 0; + p->predump_signal = sig; + } + } + rcu_read_unlock(); + return error; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2476,6 +2546,13 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; error = arch_prctl_spec_ctrl_set(me, arg2, arg3); break; + case PR_SET_PREDUMP_SIG: + error = prctl_set_predump_signal(me, (pid_t)arg2, (int)arg3); + break; + case PR_GET_PREDUMP_SIG: + error = prctl_get_predump_signal(me, (pid_t)arg2, + (int __user *)arg3); + break; default: error = -EINVAL; break; -- 1.8.3.1