While working on a safety-critical stress testing tool that uses ptrace in a rather uncommon way (stopping, peeking memory, ...) on a bunch of applications in an automated fashion, I realized that processes I had once opened were restarted and their PIDs recycled, resulting in monitoring and manipulating the wrong processes. With the advent of pidfd we are now able to stick with one stable handle to identify processes exactly. We now have the ability to do this race free. Sending signals now works like a charm; the next step is to extend the functionality to ptrace as well. API: long pidfd_ptrace(int pidfd, enum __ptrace_request request, void *addr, void *data, unsigned flags); Based on the original ptrace, the following API changes were made: - The process identifier (pidfd) is now moved to the start; this is aligned with pidfd_send_signal(int pidfd, ...) because potential future pidfd_* calls will have one thing in common: the pid identifier. I think it is natural to have this argument up front. - Add an additional flags argument, not used now - but you never know. All other arguments are identical to those of ptrace - no other modifications were made. Currently there are some pieces missing! This is just an early proposal for a new syscall.
Still missing: - support for every architecture - re-use shared functions and move to common place - perf syscall registration - selftests - ...| Userspace Example: #define _GNU_SOURCE #include <errno.h> #include <sched.h> #include <fcntl.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <sys/user.h> #include <sys/ptrace.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/wait.h> #include <linux/limits.h> #ifndef __NR_pidfd_ptrace #define __NR_pidfd_ptrace 439 #endif static inline long do_pidfd_ptrace(int pidfd, int request, void *addr, void *data, unsigned int flags) { #ifdef __NR_pidfd_ptrace return syscall(__NR_pidfd_ptrace, pidfd, request, addr, data, flags); #else return -ENOSYS; #endif } int main(int argc, char *argv[]) { int pid, pidfd, ret, sleep_time = 10; char pid_path[PATH_MAX]; struct user_regs_struct regs; if (argc < 2) { fprintf(stderr, "Usage: %s <pid>\n", argv[0]); goto err; } pid = atoi(argv[1]); sprintf(pid_path, "/proc/%d", pid); pidfd = open(pid_path, O_DIRECTORY | O_CLOEXEC); if (pidfd == -1) { fprintf(stderr, "failed to open %s\n", pid_path); goto err; } ret = do_pidfd_ptrace(pidfd, PTRACE_ATTACH, 0, 0, 0); if (ret < 0) { perror("do_pidfd_ptrace, PTRACE_ATTACH:"); goto err; } waitpid(pid, NULL, 0); ret = do_pidfd_ptrace(pidfd, PTRACE_GETREGS, NULL, &regs, 0); if (ret == -1) { perror("do_pidfd_ptrace, PTRACE_GETREGS:"); goto err; } printf("RIP: %llx\nRAX: %llx\nRCX: %llx\nRDX: %llx\nRSI: %llx\nRDI: %llx\n", regs.rip, regs.rax, regs.rcx, regs.rdx, regs.rsi, regs.rdi); fprintf(stdout, "stopping task for %d seconds\n", sleep_time); sleep(sleep_time); ret = do_pidfd_ptrace(pidfd, PTRACE_DETACH, 0, 0, 0); if (ret == -1) { perror("do_pidfd_ptrace, PTRACE_DETACH:"); goto err; } exit(EXIT_SUCCESS); err: exit(EXIT_FAILURE); } Cc: Christian Brauner <christian@xxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: Borislav Petkov <bp@xxxxxxxxx> Cc: "H. 
Peter Anvin" <hpa@xxxxxxxxx> Cc: Arnd Bergmann <arnd@xxxxxxxx> Cc: Brian Gerst <brgerst@xxxxxxxxx> Cc: Sami Tolvanen <samitolvanen@xxxxxxxxxx> Cc: David Howells <dhowells@xxxxxxxxxx> Cc: Aleksa Sarai <cyphar@xxxxxxxxxx> Cc: Andy Lutomirski <luto@xxxxxxxxxx> Cc: Oleg Nesterov <oleg@xxxxxxxxxx> Cc: Eric W. Biederman <ebiederm@xxxxxxxxxxxx> Cc: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx> Cc: Sargun Dhillon <sargun@xxxxxxxxx> Cc: linux-api@xxxxxxxxxxxxxxx Cc: linux-arch@xxxxxxxxxxxxxxx Signed-off-by: Hagen Paul Pfeifer <hagen@xxxxxxxx> --- v2: - fixed a OOPS in __x64_sys_pidfd_ptrace+0x1bf/0x220 (call to __put_task_struct()) - add userland example --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 2 + include/uapi/asm-generic/unistd.h | 4 +- kernel/ptrace.c | 126 ++++++++++++++++++++----- kernel/sys_ni.c | 1 + 6 files changed, 113 insertions(+), 22 deletions(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 54581ac671b4..593f7fab90eb 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -442,3 +442,4 @@ 435 i386 clone3 sys_clone3 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd +438 i386 pidfd_ptrace sys_pidfd_ptrace diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 37b844f839bc..cd76d8343510 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -359,6 +359,7 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common pidfd_ptrace sys_pidfd_ptrace # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1815065d52f3..254b071a5334 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1003,6 +1003,8 @@ asmlinkage long 
sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags); +asmlinkage long sys_pidfd_ptrace(int pidfd, long request, unsigned long addr, + unsigned long data, unsigned int flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 3a3201e4618e..d62505742447 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -855,9 +855,11 @@ __SYSCALL(__NR_clone3, sys_clone3) __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) +#define __NR_pidfd_ptrace 439 +__SYSCALL(__NR_pidfd_ptrace, sys_pidfd_ptrace) #undef __NR_syscalls -#define __NR_syscalls 439 +#define __NR_syscalls 440 /* * 32 bit systems traditionally used different diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..e9e7e3225b9a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -29,6 +29,7 @@ #include <linux/regset.h> #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> +#include <linux/proc_fs.h> #include <linux/compat.h> #include <linux/sched/signal.h> @@ -1239,10 +1240,39 @@ int ptrace_request(struct task_struct *child, long request, #define arch_ptrace_attach(child) do { } while (0) #endif +static inline long ptrace_call(struct task_struct *task, long request, unsigned long addr, + unsigned long data) +{ + long ret; + + if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { + ret = ptrace_attach(task, request, addr, data); + /* + * Some architectures need to do book-keeping after + * a ptrace attach. 
+ */ + if (!ret) + arch_ptrace_attach(task); + goto out; + } + + ret = ptrace_check_attach(task, request == PTRACE_KILL || + request == PTRACE_INTERRUPT); + if (ret < 0) + goto out; + + ret = arch_ptrace(task, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(task); + + out: + return ret; +} + SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, unsigned long, data) { - struct task_struct *child; + struct task_struct *task; long ret; if (request == PTRACE_TRACEME) { @@ -1252,35 +1282,89 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out; } - child = find_get_task_by_vpid(pid); - if (!child) { + task = find_get_task_by_vpid(pid); + if (!task) { ret = -ESRCH; goto out; } - if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, addr, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ + ret = ptrace_call(task, request, addr, data); + put_task_struct(task); +out: + return ret; +} + +static struct pid *pidfd_to_pid(const struct file *file) +{ + struct pid *pid; + + pid = pidfd_pid(file); + if (!IS_ERR(pid)) + return pid; + + return tgid_pidfd_to_pid(file); +} + +static bool access_pidfd_pidns(struct pid *pid) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *p = ns_of_pid(pid); + + for (;;) { + if (!p) + return false; + if (p == active) + break; + p = p->parent; + } + + return true; +} + +SYSCALL_DEFINE5(pidfd_ptrace, int, pidfd, long, request, unsigned long, addr, + unsigned long, data, unsigned int, flags) +{ + long ret; + struct fd f; + struct pid *pid; + struct task_struct *task; + + /* Enforce flags be set to 0 until we add an extension. 
*/ + if (flags) + return -EINVAL; + + if (request == PTRACE_TRACEME) { + ret = ptrace_traceme(); if (!ret) - arch_ptrace_attach(child); - goto out_put_task_struct; + arch_ptrace_attach(current); + goto out; } - ret = ptrace_check_attach(child, request == PTRACE_KILL || - request == PTRACE_INTERRUPT); - if (ret < 0) - goto out_put_task_struct; + f = fdget(pidfd); + if (!f.file) + return -EBADF; - ret = arch_ptrace(child, request, addr, data); - if (ret || request != PTRACE_DETACH) - ptrace_unfreeze_traced(child); + /* Is this a pidfd? */ + pid = pidfd_to_pid(f.file); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto err; + } - out_put_task_struct: - put_task_struct(child); - out: + ret = -EINVAL; + if (!access_pidfd_pidns(pid)) + goto err; + + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + ret = -EINVAL; + goto err; + } + + ret = ptrace_call(task, request, addr, data); +err: + fdput(f); +out: return ret; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 3b69a560a7ac..f7795294b8c4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -166,6 +166,7 @@ COND_SYSCALL(delete_module); COND_SYSCALL(syslog); /* kernel/ptrace.c */ +COND_SYSCALL_COMPAT(pidfd_ptrace); /* kernel/sched/core.c */ -- 2.26.2