On Fri, Jan 20, 2023 at 04:22:51PM +0100, Oleg Nesterov wrote: > Hi Gregory, > > I'll try to read this series next Monday, I need to recall what does > syscall-user-dispatch actually do ;) > > just one question for now, > > On 01/20, Gregory Price wrote: > > > > --- a/kernel/ptrace.c > > +++ b/kernel/ptrace.c > > @@ -370,6 +370,10 @@ static int check_ptrace_options(unsigned long data) > > if (data & ~(unsigned long)PTRACE_O_MASK) > > return -EINVAL; > > > > + if (unlikely(data & PTRACE_O_SUSPEND_SYSCALL_USER_DISPATCH) && > > + (!IS_ENABLED(CONFIG_CHECKPOINT_RESTART))) > > + return -EINVAL; > > Hmm? git grep CHECKPOINT_RESTART shows nothing. > > Oleg. > TIL the mailing lists don't like responses from proxy addresses. Resending the response so it goes out to everyone. Good catch, I always mix up RESTART/RESTORE. This should be RESTORE. Adjusted patch below; I will send a v4 tomorrow so as not to spam the lists. Attached is an updated patch for the time being. (brief syscall user dispatch overview) syscall-user-dispatch is relatively simple: the goal is to implement syscall interposition for foreign syscalls (Windows, non-POSIX, whatever). Since the ABI of these syscalls can't be trusted to be anything like Linux, syscall dispatch produces a SIGSYS before anything else can do things like check register values. How to use: 1) User registers a SIGSYS signal handler. 2) User does prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, <address>, <length>, char* selector). 3) All 'syscall' instructions *outside* the virtual address range [address, address+length) now produce a SIGSYS on the thread that executed the syscall. <selector> can be set to SYSCALL_DISPATCH_FILTER_ALLOW or SYSCALL_DISPATCH_FILTER_BLOCK to disable/enable (respectively) this signal production from userland without having to make kernel calls. 
docs: https://docs.kernel.org/admin-guide/syscall-user-dispatch.html Updated patch diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index eaaef3ffec22..461ae5c99d57 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -45,6 +45,8 @@ extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, #define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) #define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) +#define PT_SUSPEND_SYSCALL_USER_DISPATCH \ + (PTRACE_O_SUSPEND_SYSCALL_USER_DISPATCH << PT_OPT_FLAG_SHIFT) extern long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data); diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index 195ae64a8c87..ba9e3f19a22c 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -146,9 +146,13 @@ struct ptrace_rseq_configuration { /* eventless options */ #define PTRACE_O_EXITKILL (1 << 20) #define PTRACE_O_SUSPEND_SECCOMP (1 << 21) +#define PTRACE_O_SUSPEND_SYSCALL_USER_DISPATCH (1 << 22) #define PTRACE_O_MASK (\ - 0x000000ff | PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP) + 0x000000ff | \ + PTRACE_O_EXITKILL | \ + PTRACE_O_SUSPEND_SECCOMP | \ + PTRACE_O_SUSPEND_SYSCALL_USER_DISPATCH) #include <asm/ptrace.h> diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 0b6379adff6b..b5ec75164805 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -8,6 +8,7 @@ #include <linux/uaccess.h> #include <linux/signal.h> #include <linux/elf.h> +#include <linux/ptrace.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> @@ -36,6 +37,10 @@ bool syscall_user_dispatch(struct pt_regs *regs) struct syscall_user_dispatch *sd = &current->syscall_dispatch; char state; + if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SYSCALL_USER_DISPATCH)) + return false; + if 
(likely(instruction_pointer(regs) - sd->offset < sd->len)) return false; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54482193e1ed..a348b68d07a2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -370,6 +370,10 @@ static int check_ptrace_options(unsigned long data) if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; + if (unlikely(data & PTRACE_O_SUSPEND_SYSCALL_USER_DISPATCH) && + (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE))) + return -EINVAL; + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || !IS_ENABLED(CONFIG_SECCOMP))