This change introduces the new system call: process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags, siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask) process_vm_exec allows to execute the current process in an address space of another process. process_vm_exec swaps the current address space with an address space of a specified process, sets a state from sigcontex and resumes the process. When a process receives a signal or calls a system call, process_vm_exec saves the process state back to sigcontext, restores the origin address space, restores the origin process state, and returns to userspace. If it was interrupted by a signal and the signal is in the user_mask, the signal is dequeued and information about it is saved in uinfo. If process_vm_exec is interrupted by a system call, a synthetic siginfo for the SIGSYS signal is generated. The behavior of this system call is similar to PTRACE_SYSEMU but everything is happing in the context of one process, so process_vm_exec shows a better performance. PTRACE_SYSEMU is primarily used to implement sandboxes (application kernels) like User-mode Linux or gVisor. These type of sandboxes intercepts applications system calls and acts as the guest kernel. A simple benchmark, where a "tracee" process executes systems calls in a loop and a "tracer" process traps syscalls and handles them just incrementing the tracee instruction pointer to skip the syscall instruction shows that process_vm_exec works more than 5 times faster than PTRACE_SYSEMU. Signed-off-by: Andrei Vagin <avagin@xxxxxxxxx> --- arch/Kconfig | 15 +++ arch/x86/Kconfig | 1 + arch/x86/entry/common.c | 16 +++ arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/x86/include/asm/sigcontext.h | 2 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/process_vm_exec.c | 133 +++++++++++++++++++++++++ arch/x86/kernel/signal.c | 47 +++++++++ include/linux/process_vm_exec.h | 15 +++ include/linux/sched.h | 7 ++ include/linux/syscalls.h | 6 ++ include/uapi/asm-generic/unistd.h | 4 +- kernel/fork.c | 9 ++ kernel/sys_ni.c | 2 + 14 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/process_vm_exec.c create mode 100644 include/linux/process_vm_exec.h diff --git a/arch/Kconfig b/arch/Kconfig index ba4e966484ab..3ed9b8fb1727 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -514,6 +514,21 @@ config SECCOMP_FILTER See Documentation/userspace-api/seccomp_filter.rst for details. +config HAVE_ARCH_PROCESS_VM_EXEC + bool + help + An arch should select this symbol to support the process_vm_exec system call. + +config PROCESS_VM_EXEC + prompt "Enable the process_vm_exec syscall" + def_bool y + depends on HAVE_ARCH_PROCESS_VM_EXEC + help + process_vm_exec allows executing code and system calls in a specified + address space. + + If unsure, say Y. + config HAVE_ARCH_STACKLEAK bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fbf26e0f7a6a..1c7ebb58865e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,6 +27,7 @@ config X86_64 select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY + select HAVE_ARCH_PROCESS_VM_EXEC select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select SWIOTLB diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 870efeec8bda..42eac459b25b 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -19,6 +19,7 @@ #include <linux/nospec.h> #include <linux/syscalls.h> #include <linux/uaccess.h> +#include <linux/process_vm_exec.h> #ifdef CONFIG_XEN_PV #include <xen/xen-ops.h> @@ -38,6 +39,21 @@ #ifdef CONFIG_X86_64 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { +#ifdef CONFIG_PROCESS_VM_EXEC + if (current->exec_mm && current->exec_mm->ctx) { + kernel_siginfo_t info = { + .si_signo = SIGSYS, + .si_call_addr = (void __user *)KSTK_EIP(current), + .si_arch = syscall_get_arch(current), + .si_syscall = nr, + }; + restore_vm_exec_context(regs); + regs->ax = copy_siginfo_to_user(current->exec_mm->siginfo, &info); + syscall_exit_to_user_mode(regs); + return; + } +#endif + nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 379819244b91..2a8e27b2d87e 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -362,6 +362,7 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 64 process_vm_exec sys_process_vm_exec # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 140d890c2c98..e390410cc3e9 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -6,4 +6,6 @@ #include <uapi/asm/sigcontext.h> +extern long swap_vm_exec_context(struct sigcontext __user *uctx); + #endif /* _ASM_X86_SIGCONTEXT_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 68608bd892c0..d053289fd19e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -163,3 +163,4 @@ ifeq ($(CONFIG_X86_64),y) endif obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o +obj-$(CONFIG_PROCESS_VM_EXEC) += process_vm_exec.o diff --git a/arch/x86/kernel/process_vm_exec.c b/arch/x86/kernel/process_vm_exec.c new file mode 100644 index 000000000000..28b32330f744 --- /dev/null +++ b/arch/x86/kernel/process_vm_exec.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/syscall.h> +#include <asm/sigframe.h> +#include <asm/signal.h> +#include <asm/mmu_context.h> +#include <asm/sigcontext.h> + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/sched/mm.h> +#include <linux/syscalls.h> +#include <linux/vmacache.h> +#include <linux/process_vm_exec.h> + +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm) +{ + struct task_struct *tsk = current; + struct mm_struct *active_mm; + + task_lock(tsk); + /* Hold off tlb flush IPIs while switching mm's */ + local_irq_disable(); + + sync_mm_rss(prev_mm); + + vmacache_flush(tsk); + + active_mm = tsk->active_mm; + if (active_mm != target_mm) { + mmgrab(target_mm); + tsk->active_mm = target_mm; + } + tsk->mm = target_mm; + switch_mm_irqs_off(active_mm, target_mm, tsk); + local_irq_enable(); + task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif + + if (active_mm != target_mm) + mmdrop(active_mm); +} + +void restore_vm_exec_context(struct pt_regs *regs) +{ + struct sigcontext __user *uctx; + struct mm_struct *prev_mm, *target_mm; + + uctx = current->exec_mm->ctx; + current->exec_mm->ctx = NULL; + + target_mm = current->exec_mm->mm; + current->exec_mm->mm = NULL; + prev_mm = current->mm; + + swap_mm(prev_mm, target_mm); + + mmput(prev_mm); + mmdrop(target_mm); + + swap_vm_exec_context(uctx); +} + +SYSCALL_DEFINE6(process_vm_exec, pid_t, pid, struct sigcontext __user *, uctx, + unsigned long, flags, siginfo_t __user *, uinfo, + sigset_t __user *, user_mask, size_t, sizemask) +{ + struct mm_struct *prev_mm, *mm; + struct task_struct *tsk; + long ret = -ESRCH; + + sigset_t mask; + + if (flags) + return -EINVAL; + + if (sizemask != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&mask, user_mask, sizeof(mask))) + return -EFAULT; + + sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + signotset(&mask); + + tsk = find_get_task_by_vpid(pid); + if (!tsk) { + ret = -ESRCH; + goto err; + } + mm = mm_access(tsk, PTRACE_MODE_ATTACH_REALCREDS); + put_task_struct(tsk); + if (!mm || IS_ERR(mm)) { + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + goto err; + } + + current_pt_regs()->ax = 0; + ret = swap_vm_exec_context(uctx); + if (ret < 0) + goto err_mm_put; + + if (!current->exec_mm) { + ret = -ENOMEM; + current->exec_mm = kmalloc(sizeof(*current->exec_mm), GFP_KERNEL); + if (current->exec_mm == NULL) + goto err_mm_put; + } + current->exec_mm->ctx = uctx; + current->exec_mm->mm = current->mm; + current->exec_mm->flags = flags; + current->exec_mm->sigmask = mask; + current->exec_mm->siginfo = uinfo; + prev_mm = current->mm; + + mmgrab(prev_mm); + swap_mm(prev_mm, mm); + + ret = current_pt_regs()->ax; + + return ret; +err_mm_put: + mmput(mm); +err: + return ret; +} + +void free_exec_mm_struct(struct task_struct *p) +{ + kfree(p->exec_mm); + p->exec_mm = NULL; +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cc269a20dd5f..51286c79062b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -27,6 +27,7 @@ #include <linux/context_tracking.h> #include <linux/entry-common.h> #include <linux/syscalls.h> +#include <linux/process_vm_exec.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -816,6 +817,23 @@ void arch_do_signal(struct pt_regs *regs) { struct ksignal ksig; +#ifdef CONFIG_PROCESS_VM_EXEC + if (current->exec_mm && current->exec_mm->ctx) { + kernel_siginfo_t info; + int ret; + + restore_vm_exec_context(current_pt_regs()); + + spin_lock_irq(¤t->sighand->siglock); + ret = dequeue_signal(current, ¤t->exec_mm->sigmask, &info); + spin_unlock_irq(¤t->sighand->siglock); + + if (ret > 0) + ret = copy_siginfo_to_user(current->exec_mm->siginfo, &info); + regs->ax = ret; + } +#endif + if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); @@ -896,3 +914,32 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) return 0; } #endif + +#ifdef CONFIG_PROCESS_VM_EXEC +long swap_vm_exec_context(struct sigcontext __user *uctx) +{ + struct sigcontext ctx = {}; + sigset_t set = {}; + + + if (copy_from_user(&ctx, uctx, CONTEXT_COPY_SIZE)) + return -EFAULT; + /* A floating point state is managed from user-space. */ + if (ctx.fpstate != 0) + return -EINVAL; + if (!user_access_begin(uctx, sizeof(*uctx))) + return -EFAULT; + unsafe_put_sigcontext(uctx, NULL, current_pt_regs(), (&set), Efault); + user_access_end(); + + if (__restore_sigcontext(current_pt_regs(), &ctx, 0)) + goto badframe; + + return 0; +Efault: + user_access_end(); +badframe: + signal_fault(current_pt_regs(), uctx, "swap_vm_exec_context"); + return -EFAULT; +} +#endif diff --git a/include/linux/process_vm_exec.h b/include/linux/process_vm_exec.h new file mode 100644 index 000000000000..a02535fbd5c8 --- /dev/null +++ b/include/linux/process_vm_exec.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PROCESS_VM_EXEC_H +#define _LINUX_PROCESS_VM_EXEC_H + +struct exec_mm { + struct sigcontext *ctx; + struct mm_struct *mm; + unsigned long flags; + sigset_t sigmask; + siginfo_t __user *siginfo; +}; + +void free_exec_mm_struct(struct task_struct *tsk); + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 76cd21fa5501..864a8fdd0ed7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -64,6 +64,7 @@ struct signal_struct; struct task_delay_info; struct task_group; struct io_uring_task; +struct exec_mm; /* * Task state bitmask. NOTE! These bits are also @@ -637,6 +638,8 @@ struct wake_q_node { struct wake_q_node *next; }; +struct exec_mm; + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -757,6 +760,10 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; +#ifdef CONFIG_PROCESS_VM_EXEC + struct exec_mm *exec_mm; +#endif + /* Per-thread vma caching: */ struct vmacache vmacache; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 37bea07c12f2..bdea75a14975 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1347,4 +1347,10 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen); int __sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen); + +#ifdef CONFIG_PROCESS_VM_EXEC +void restore_vm_exec_context(struct pt_regs *regs); +#else +static inline void restore_vm_exec_context(struct pt_regs *regs) {} +#endif #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 2056318988f7..60acbd9cf511 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_process_madvise 441 +__SYSCALL(__NR_process_vm_exec, sys_process_vm_exec) #undef __NR_syscalls -#define __NR_syscalls 441 +#define __NR_syscalls 442 /* * 32 bit systems traditionally used different diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..61ca7a4a1130 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,6 +96,7 @@ #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> +#include <linux/process_vm_exec.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -461,6 +462,9 @@ void free_task(struct task_struct *tsk) arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); +#ifdef CONFIG_PROCESS_VM_EXEC + free_exec_mm_struct(tsk); +#endif free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -943,6 +947,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif + +#ifdef CONFIG_PROCESS_VM_EXEC + tsk->exec_mm = NULL; +#endif + return tsk; free_stack: diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f27ac94d5fa7..2545a409bb07 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -350,6 +350,8 @@ COND_SYSCALL(pkey_mprotect); COND_SYSCALL(pkey_alloc); COND_SYSCALL(pkey_free); +/* execute in another address space */ +COND_SYSCALL(process_vm_exec); /* * Architecture specific weak syscall entries. -- 2.29.2