Signed-off-by: Florian Weimer <fweimer@xxxxxxxxxx> --- v3: Patch split out. tools/arch/x86/include/uapi/asm/prctl.h | 2 + tools/testing/selftests/x86/Makefile | 7 +- .../testing/selftests/x86/vsyscall_control.c | 891 ++++++++++++++++++ 3 files changed, 899 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/vsyscall_control.c diff --git a/tools/arch/x86/include/uapi/asm/prctl.h b/tools/arch/x86/include/uapi/asm/prctl.h index 754a07856817..aad0bcfbf49f 100644 --- a/tools/arch/x86/include/uapi/asm/prctl.h +++ b/tools/arch/x86/include/uapi/asm/prctl.h @@ -18,4 +18,6 @@ #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_VSYSCALL_CONTROL 0x5001 + #endif /* _ASM_X86_PRCTL_H */ diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 0993d12f2c38..4c751836bfeb 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -18,7 +18,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ - corrupt_xstate_header amx + corrupt_xstate_header amx vsyscall_control # Some selftests require 32bit support enabled also on 64bit systems TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall @@ -107,3 +107,8 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S # state. $(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static $(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static + +# This test does not link against anything (neither libc nor libgcc). 
+$(OUTPUT)/vsyscall_control_64: \
+ LIBS := -Wl,-no-pie -static -nostdlib -nostartfiles
+ CFLAGS += -fno-pie -fno-stack-protector -fno-builtin -ffreestanding
diff --git a/tools/testing/selftests/x86/vsyscall_control.c b/tools/testing/selftests/x86/vsyscall_control.c
new file mode 100644
index 000000000000..ee966f936c89
--- /dev/null
+++ b/tools/testing/selftests/x86/vsyscall_control.c
@@ -0,0 +1,891 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vsyscall_control.c - check that disabling vsyscall works
+ * Copyright (C) 2021 Red Hat, Inc.
+ *
+ * This test requires vsyscall=xonly or vsyscall=emulate.  With
+ * vsyscall=emulate, ARCH_VSYSCALL_CONTROL cannot turn off vsyscall
+ * completely (reads still work), but this is not tested here.
+ */
+
+#include <stddef.h>
+
+#include <asm/prctl.h>
+#include <asm/vsyscall.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/signal.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/unistd.h>
+
+#ifndef ARCH_VSYSCALL_CONTROL
+#define ARCH_VSYSCALL_CONTROL 0x5001
+#elif ARCH_VSYSCALL_CONTROL != 0x5001
+#error wrong value for ARCH_VSYSCALL_CONTROL
+#endif
+
+
+/*
+ * Raw system call wrappers.  The system call number is passed in
+ * %rax and the arguments in %rdi, %rsi, %rdx, %r10, %r8 (in that
+ * order).  The value left in %rax is returned unchanged; callers in
+ * this file treat negative values as error codes.
+ */
+
+/* Issue system call NR without arguments. */
+static inline long syscall0(int nr)
+{
+	unsigned long result;
+
+	__asm__ volatile ("syscall"
+			  : "=a" (result)
+			  : "0" (nr)
+			  : "memory", "cc", "r11", "cx");
+	return result;
+}
+
+/* Issue system call NR with one argument. */
+static inline long syscall1(int nr, long arg0)
+{
+	register long rdi asm("rdi") = arg0;
+	unsigned long result;
+
+	__asm__ volatile ("syscall"
+			  : "=a" (result)
+			  : "0" (nr), "r" (rdi)
+			  : "memory", "cc", "r11", "cx");
+	return result;
+}
+
+/* Issue system call NR with two arguments. */
+static inline long syscall2(int nr, long arg0, long arg1)
+{
+	register long rdi asm("rdi") = arg0;
+	register long rsi asm("rsi") = arg1;
+	unsigned long result;
+
+	__asm__ volatile ("syscall"
+			  : "=a" (result)
+			  : "0" (nr), "r" (rdi), "r" (rsi)
+			  : "memory", "cc", "r11", "cx");
+	return result;
+}
+
+/* Issue system call NR with three arguments. */
+static inline long syscall3(int nr, long arg0, long arg1, long arg2)
+{
+	register long rdi asm("rdi") = arg0;
+	register long rsi asm("rsi") = arg1;
+	register long rdx asm("rdx") = arg2;
+	unsigned long result;
+
+	__asm__ volatile ("syscall"
+			  : "=a" (result)
+			  : "0" (nr), "r" (rdi), "r" (rsi), "r" (rdx)
+			  : "memory", "cc", "r11", "cx");
+	return result;
+}
+
+/* Issue system call NR with four arguments. */
+static inline long syscall4(int nr, long arg0, long arg1, long arg2, long arg3)
+{
+	register long rdi asm("rdi") = arg0;
+	register long rsi asm("rsi") = arg1;
+	register long rdx asm("rdx") = arg2;
+	register long r10 asm("r10") = arg3;
+	unsigned long result;
+
+	__asm__ volatile ("syscall"
+			  : "=a" (result)
+			  : "0" (nr), "r" (rdi), "r" (rsi), "r" (rdx),
+			    "r" (r10)
+			  : "memory", "cc", "r11", "cx");
+	return result;
+}
+
+/* Issue system call NR with five arguments. */
+static inline long syscall5(int nr, long arg0, long arg1, long arg2, long arg3,
+			    long arg4)
+{
+	register long rdi asm("rdi") = arg0;
+	register long rsi asm("rsi") = arg1;
+	register long rdx asm("rdx") = arg2;
+	register long r10 asm("r10") = arg3;
+	register long r8 asm("r8") = arg4;
+	unsigned long result;
+
+	__asm__ volatile ("syscall"
+			  : "=a" (result)
+			  : "0" (nr), "r" (rdi), "r" (rsi), "r" (rdx),
+			    "r" (r10), "r" (r8)
+			  : "memory", "cc", "r11", "cx");
+	return result;
+}
+
+/*
+ * Perform an indirect call to ADDR (loaded into %rax) with two
+ * arguments in %rdi and %rsi, and return the value left in %rax.
+ *
+ * NOTE(review): the call instruction pushes a return address below
+ * %rsp from inside inline asm.  Presumably this is only safe as long
+ * as the compiler keeps nothing in the red zone of the containing
+ * function - confirm, or build with -mno-red-zone.
+ */
+static inline long vsyscall2(long addr, long arg0, long arg1)
+{
+	register long rdi asm("rdi") = arg0;
+	register long rsi asm("rsi") = arg1;
+	unsigned long result;
+
+	__asm__ volatile ("callq *%%rax"
+			  : "=a" (result)
+			  : "0" (addr), "r" (rdi), "r" (rsi)
+			  : "memory", "cc", "r11", "cx");
+	return result;
+}
+
+/* Terminate the process with STATUS via the exit system call. */
+static void __attribute__ ((noreturn)) sys_exit(int status)
+{
+	syscall1(__NR_exit, status);
+	__builtin_unreachable();
+}
+
+/* Thin wrappers; each returns the raw system call result. */
+
+static int sys_access(const char *pathname, int mode)
+{
+	return syscall2(__NR_access, (long) pathname, mode);
+}
+
+static int sys_mkdir(const char *pathname, __kernel_mode_t mode)
+{
+	return syscall2(__NR_mkdir, (long) pathname, mode);
+}
+
+static int sys_open(const char *pathname, int flags, __kernel_mode_t mode)
+{
+	return syscall3(__NR_open, (long) pathname, flags, mode);
+}
+
+static long sys_read(int fd, void *buffer, size_t length)
+{
+	return syscall3(__NR_read, fd, (long) buffer, length);
+}
+
+static long sys_write(int fd, const void *buffer, size_t length)
+{
+	return syscall3(__NR_write, fd, (long) buffer, length);
+}
+
+static int sys_mount(const char *source, const char *pathname,
+		     const char *fstype, unsigned long flags,
+		     const void *data)
+{
+	return syscall5(__NR_mount, (long) source, (long) pathname,
+			(long) fstype, flags, (long) data);
+}
+
+/* Send SIGABRT to the current process (getpid + kill). */
+static void sigabrt(void)
+{
+	syscall2(__NR_kill, syscall0(__NR_getpid), SIGABRT);
+}
+
+/*
+ * String buffers.
+ */
+
+/* Write cursor into a caller-provided character array. */
+struct buffer {
+	char *position;		/* Next byte to be written. */
+	char *limit;		/* One past the last usable byte. */
+};
+
+/* Point B at the LENGTH bytes starting at START. */
+static void buffer_init(struct buffer *b, char *start, size_t length)
+{
+	b->position = start;
+	b->limit = start + length;
+}
+
+/* Append CH to B; raises SIGABRT if the buffer is already full. */
+static void buffer_append(struct buffer *b, char ch)
+{
+	if (b->position >= b->limit)
+		sigabrt();
+	*b->position = ch;
+	++b->position;
+}
+
+/* Append the characters of P, not including the terminating NUL. */
+static void buffer_append_string(struct buffer *b, const char *p)
+{
+	while (*p) {
+		buffer_append(b, *p);
+		++p;
+	}
+}
+
+/*
+ * Recursive helper for buffer_append_dec: appends the decimal digits
+ * of VAL, most significant first.  Appends nothing for zero.
+ */
+static void buffer_append_dec_1(struct buffer *b, unsigned long val)
+{
+	if (val != 0) {
+		buffer_append_dec_1(b, val / 10);
+		buffer_append(b, '0' + (val % 10));
+	}
+}
+
+/* Append VAL in decimal notation; zero is written as "0". */
+static void buffer_append_dec(struct buffer *b, unsigned long val)
+{
+	if (val == 0) {
+		buffer_append(b, '0');
+		return;
+	}
+	buffer_append_dec_1(b, val);
+}
+
+/*
+ * Output to standard output.
+ */ + +static void print_char(char byte) +{ + if (sys_write(1, &byte, 1) < 0) + sigabrt(); +} + +static void print_string(const char *p) +{ + while (*p) { + print_char(*p); + ++p; + } +} + +static void print_dec_1(unsigned long val) +{ + if (val != 0) { + print_dec_1(val / 10); + print_char('0' + (val % 10)); + } +} + +static void print_dec(unsigned long val) +{ + if (val == 0) + print_char('0'); + else + print_dec_1(val); +} + +static void print_signed_dec(long val) +{ + if (val < 0) { + print_char('-'); + print_dec(-(unsigned long)val); + } else + print_dec(val); +} + +static void print_message(unsigned int lineno, const char *tag, const char *p) +{ + print_string(__FILE__); + print_char(':'); + print_dec(lineno); + print_char(':'); + print_char(' '); + print_string(tag); + print_char(':'); + print_char(' '); + print_string(p); +} + +static void print_info(unsigned int lineno, const char *p) +{ + print_message(lineno, "info", p); +} + +static void print_error(unsigned int lineno, const char *p) +{ + print_message(lineno, "ERROR", p); +} + +static void print_failure(int lineno, const char *label, long ret) +{ + print_error(lineno, label); + print_string(" failed: "); + print_signed_dec(ret); + print_char('\n'); +} + +static void print_time(int lineno, const char *label, struct timeval tv) +{ + print_info(lineno, label); + print_string(": "); + print_dec(tv.tv_sec); + print_char(' '); + print_dec(tv.tv_usec); + print_char('\n'); +} + +/* + * Process-failing (v)syscall wrappers. 
+ */ + +static void xgettimeofday(struct timeval *tv) +{ + long ret = syscall2(__NR_gettimeofday, (long) tv, 0); + + if (ret != 0) { + print_failure(__LINE__, "gettimeofday", ret); + sigabrt(); + } +} + +static void xvgettimeofday(struct timeval *tv) +{ + long ret = vsyscall2(VSYSCALL_ADDR, (long) tv, 0); + + if (ret) { + print_failure(__LINE__, "vgettimeofday", ret); + sigabrt(); + } +} + +static int sys_arch_prctl(int code, unsigned long addr) +{ + return syscall2(__NR_arch_prctl, code, addr); +} + +static void xclose(int fd) +{ + long ret = syscall1(__NR_close, fd); + + if (ret < 0) { + print_failure(__LINE__, "close", ret); + sigabrt(); + } +} + +static void xwrite_byte(int fd, char b) +{ + long ret = sys_write(fd, &b, 1); + + if (ret != 1) { + print_failure(__LINE__, "write", ret); + sigabrt(); + } +} + +static int xread_byte(int fd) +{ + char b; + long ret = sys_read(fd, &b, 1); + + if (ret != 1) { + print_failure(__LINE__, "read", ret); + sigabrt(); + } + return b; +} + +static void xpipe(int fds[static 2]) +{ + long ret = syscall2(__NR_pipe2, (long) fds, O_CLOEXEC); + + if (ret != 0) { + print_failure(__LINE__, "pipe2", ret); + sigabrt(); + } +} + +static __kernel_pid_t xfork(void) +{ + long ret = syscall0(__NR_fork); + + if (ret < 0) { + print_failure(__LINE__, "fork", ret); + sigabrt(); + } + return ret; +} + +static void xexecve(const char *pathname, char **argv, char **envp) +{ + long ret; + + ret = syscall3(__NR_execve, (long) pathname, (long) argv, (long) envp); + print_failure(__LINE__, "execve", ret); + sigabrt(); +} + +static __kernel_pid_t xwaitpid(__kernel_pid_t pid, int *status, int options) +{ + long ret = syscall4(__NR_wait4, pid, (long) status, options, 0); + + if (ret < 0) { + print_failure(__LINE__, "wait4", ret); + sigabrt(); + } + return ret; +} + +/* + * Various helpers. 
+ */ + +static void vsyscall_disable(void) +{ + long ret = sys_arch_prctl(ARCH_VSYSCALL_CONTROL, 0); + + if (ret != 0) + print_failure(__LINE__, "arch_prctl(ARCH_VSYSCALL_CONTROL, 0)", ret); +} + +static void vsyscall_enable(void) +{ + long ret = sys_arch_prctl(ARCH_VSYSCALL_CONTROL, 1); + + if (ret != 0) + print_failure(__LINE__, "arch_prctl(ARCH_VSYSCALL_CONTROL, 1)", ret); +} + +static long difftime(struct timeval first, struct timeval second) +{ + return second.tv_usec - first.tv_usec + + (second.tv_sec - first.tv_sec) * 1000 * 1000; +} + +static void ensure_proc_is_mounted(int *status) +{ + int ret; + + if (sys_access("/proc/version", 0) == 0) + return; + + ret = sys_mkdir("/proc", 0555); + if (ret == EEXIST) + return; + if (ret != 0) { + print_failure(__LINE__, "could not create /proc", ret); + *status = 1; + return; + } + ret = sys_mount("none", "/proc", "proc", 0, NULL); + if (ret != 0) { + print_failure(__LINE__, "could not mount /proc", ret); + *status = 1; + return; + } + if (sys_access("/proc/version", 0) != 0) { + print_error(__LINE__, "no /proc/version after mounting /proc"); + *status = 1; + return; + } +} + +/* + * Individual subtest functions. 
+ */ + +static void check_time(int *status) +{ + struct timeval initial_time = { -1, -1 }; + struct timeval vsyscall_time = { -1, -1 }; + struct timeval final_time = { -1, -1 }; + long vsyscall_diff, final_diff; + + xgettimeofday(&initial_time); + xvgettimeofday(&vsyscall_time); + xgettimeofday(&final_time); + vsyscall_diff = difftime(initial_time, vsyscall_time); + final_diff = difftime(vsyscall_time, final_time); + + print_time(__LINE__, "initial gettimeofday", initial_time); + print_time(__LINE__, "vsyscall gettimeofday", vsyscall_time); + print_time(__LINE__, "final gettimeofday", final_time); + + if (initial_time.tv_sec < 0 || initial_time.tv_usec < 0 || + vsyscall_time.tv_sec < 0 || vsyscall_time.tv_usec < 0 || + final_time.tv_sec < 0 || final_time.tv_usec < 0) { + print_error(__LINE__, "negative time\n"); + *status = 1; + } + + print_info(__LINE__, "differences: "); + print_signed_dec(vsyscall_diff); + print_char(' '); + print_signed_dec(final_diff); + print_char('\n'); + + if (vsyscall_diff < 0 || final_diff < 0) { + /* + * This may produce false positives if there is an active NTP. + */ + print_error(__LINE__, "time went backwards\n"); + *status = 1; + } +} + +static void check_lockout_after_fork(int *status, int twice) +{ + __kernel_pid_t pid; + struct timeval vsyscall_time; + int wstatus; + + if (twice) { + __kernel_pid_t pid_outer; + + print_info(__LINE__, "checking that lockout is inherited by fork\n"); + + pid_outer = xfork(); + if (pid_outer == 0) { + vsyscall_disable(); + /* + * Logic for the subprocess follows below. + */ + } else { + xwaitpid(pid_outer, &wstatus, 0); + if (wstatus != 0) { + print_error(__LINE__, "unexpected exit status: "); + print_signed_dec(wstatus); + print_char('\n'); + *status = 1; + } + return; + } + } else + print_info(__LINE__, "checking that lockout works after one fork\n"); + + pid = xfork(); + if (pid == 0) { + if (!twice) + vsyscall_disable(); + /* + * This should trigger a fault. 
+ */ + xvgettimeofday(&vsyscall_time); + sys_exit(0); + } + xwaitpid(pid, &wstatus, 0); + switch (wstatus) { + case 0: + print_error(__LINE__, "no crash after lockout\n"); + *status = 1; + break; + case 0x0100: + *status = 1; + break; + case SIGSEGV: + print_info(__LINE__, "termination after lockout\n"); + break; + default: + print_error(__LINE__, "unexpected exit status: "); + print_signed_dec(wstatus); + print_char('\n'); + *status = 1; + } + + if (twice) + sys_exit(*status); + + /* + * Status in the parent process should be unaffected. + */ + xvgettimeofday(&vsyscall_time); +} + +static void check_no_lockout_after_enable(int *status) +{ + __kernel_pid_t pid; + struct timeval vsyscall_time; + int wstatus; + + print_info(__LINE__, "checking that vsyscall can be re-enabled\n"); + + pid = xfork(); + if (pid == 0) { + vsyscall_disable(); + vsyscall_enable(); + xvgettimeofday(&vsyscall_time); + print_time(__LINE__, "vsyscall time after re-enable", vsyscall_time); + sys_exit(0); + } + + xwaitpid(pid, &wstatus, 0); + if (wstatus != 0) { + print_error(__LINE__, "unexpected exit status: "); + print_signed_dec(wstatus); + print_char('\n'); + *status = 1; + } +} + +static void check_no_lockout_after_execve(char **argv, int *status) +{ + __kernel_pid_t pid; + int wstatus; + + print_info(__LINE__, "checking that lockout is not inherited by execve\n"); + pid = xfork(); + if (pid == 0) { + struct timeval vsyscall_time; + char *new_argv[3]; + + xvgettimeofday(&vsyscall_time); + vsyscall_disable(); + + /* + * Re-exec the second stage. See main_2 below. 
+ */ + new_argv[0] = argv[0]; + new_argv[1] = "2"; + new_argv[2] = NULL; + xexecve(argv[0], new_argv, new_argv + 2); + } + + xwaitpid(pid, &wstatus, 0); + if (wstatus != 0) { + print_error(__LINE__, "unexpected exit status: "); + print_signed_dec(wstatus); + print_char('\n'); + *status = 1; + } +} + +static int has_vsyscall_map(int pid, int *status) +{ + int result = 0; + char maps_path[50]; + int fd; + + /* + * Construct /proc/PID/maps path. + */ + { + struct buffer b; + + buffer_init(&b, maps_path, sizeof(maps_path)); + buffer_append_string(&b, "/proc/"); + if (pid == 0) + buffer_append_string(&b, "self"); + else + buffer_append_dec(&b, pid); + buffer_append_string(&b, "/maps"); + buffer_append(&b, 0); + } + + fd = sys_open(maps_path, O_RDONLY, 0); + if (fd < 0) { + print_error(__LINE__, "maps file "); + print_string(maps_path); + print_string(": "); + print_signed_dec(fd); + print_char('\n'); + *status = 1; + } + + /* + * Search for "[vsyscall]\n". + */ + { + char buf[4096]; + long ret; + + for (size_t i = 0; i < sizeof(buf); ++i) + buf[i] = 0; + ret = sys_read(fd, buf, sizeof(buf)); + + if (ret < 0 || ret >= sizeof(buf)) + print_failure(__LINE__, "read", ret); + else { + char *bracket = buf; + + while (1) { + while (*bracket && *bracket != '[') + ++bracket; + if (*bracket != '[') + /* + * End of file has been reached. 
+ */ + break; + ++bracket; + if (bracket[0] == 'v' + && bracket[1] == 's' + && bracket[2] == 'y' + && bracket[3] == 's' + && bracket[4] == 'c' + && bracket[5] == 'a' + && bracket[6] == 'l' + && bracket[7] == 'l' + && bracket[8] == ']' + && bracket[9] == '\n') { + result = 1; + break; + } + } + } + } + + xclose(fd); + return result; +} + +static void check_vsyscall_in_self_maps(int *status) +{ + __kernel_pid_t pid; + int wstatus; + + print_info(__LINE__, "checking [vsyscall] in /proc/self/maps\n"); + + pid = xfork(); + if (pid == 0) { + if (!has_vsyscall_map(0, status)) { + print_error(__LINE__, "[vsyscall] missing\n"); + *status = 1; + } + vsyscall_disable(); + if (has_vsyscall_map(0, status)) { + print_error(__LINE__, "[vsyscall] present after disable\n"); + *status = 1; + } + vsyscall_enable(); + if (!has_vsyscall_map(0, status)) { + print_error(__LINE__, "[vsyscall] missing after enable\n"); + *status = 1; + } + sys_exit(*status); + } + + xwaitpid(pid, &wstatus, 0); + if (wstatus != 0) { + print_error(__LINE__, "unexpected exit status: "); + print_signed_dec(wstatus); + print_char('\n'); + *status = 1; + } +} + +static void check_vsyscall_maps_for_subprocess(int *status) +{ + __kernel_pid_t pid; + int wstatus; + int outer_to_inner[2]; + int inner_to_outer[2]; + + /* + * Create pipes used to synchronize the two processes. 
+ */ + xpipe(outer_to_inner); + xpipe(inner_to_outer); + + print_info(__LINE__, "checking [vsyscall] in /proc/PID/maps\n"); + + pid = xfork(); + if (pid == 0) { + xclose(outer_to_inner[1]); + xclose(inner_to_outer[0]); + + xread_byte(outer_to_inner[0]); + vsyscall_disable(); + xwrite_byte(inner_to_outer[1], 101); + + xread_byte(outer_to_inner[0]); + vsyscall_disable(); + xwrite_byte(inner_to_outer[1], 102); + + sys_exit(0); + } + + xclose(outer_to_inner[0]); + xclose(inner_to_outer[1]); + + if (!has_vsyscall_map(pid, status)) { + print_error(__LINE__, "subprocess starts without [vsyscall]"); + *status = 1; + } + + xwrite_byte(outer_to_inner[1], 1); + xread_byte(inner_to_outer[0]); + + if (has_vsyscall_map(pid, status)) { + print_error(__LINE__, "subprocess has [vsyscall] after disable"); + *status = 1; + } + + xwrite_byte(outer_to_inner[1], 2); + xread_byte(inner_to_outer[0]); + + if (has_vsyscall_map(pid, status)) { + print_error(__LINE__, "subprocess lacks [vsyscall] after enable"); + *status = 1; + } + + xclose(outer_to_inner[1]); + xclose(inner_to_outer[0]); + + xwaitpid(pid, &wstatus, 0); + if (wstatus != 0) { + print_error(__LINE__, "unexpected exit status: "); + print_signed_dec(wstatus); + print_char('\n'); + *status = 1; + } +} + +static void check_einval(int *status) +{ + long ret; + + ret = sys_arch_prctl(ARCH_VSYSCALL_CONTROL, 2); + if (ret != -EINVAL) { + print_error(__LINE__, "arch_prctl(ARCH_VSYSCALL_CONTROL, 2) returned "); + print_signed_dec(ret); + print_char('\n'); + *status = 1; + } +} + +/* + * Second stage: Check that the lockout is not inherited across execve. + * Used from check_no_lockout_after_execve. 
 + */
+static int main_2(void)
+{
+	struct timeval vsyscall_time = { -1, -1 };
+	int status = 0;
+
+	xvgettimeofday(&vsyscall_time);
+	/*
+	 * NOTE(review): the label says "after fork", but this function
+	 * runs in the re-executed program (argument "2") - confirm
+	 * whether "after execve" was intended.
+	 */
+	print_time(__LINE__, "vsyscall gettimeofday after fork", vsyscall_time);
+	/* Negative fields mean the { -1, -1 } initializer was not overwritten. */
+	if (vsyscall_time.tv_sec < 0 || vsyscall_time.tv_usec < 0)
+		status = 1;
+
+	return status;
+}
+
+/*
+ * Test driver.  Without arguments, runs all subtests and returns 0 if
+ * none of them set the status to 1.  If the first argument starts
+ * with '2', runs main_2 instead; any other argument prints a usage
+ * line and returns 1.
+ */
+static int main(int argc, char **argv)
+{
+	int status = 0;
+
+	if (argc > 1) {
+		switch (*argv[1]) {
+		case '2':
+			return main_2();
+		default:
+			print_string("usage: ");
+			print_string(argv[0]);
+			print_string("\n");
+			return 1;
+		}
+	}
+
+	ensure_proc_is_mounted(&status);
+
+	/*
+	 * NOTE(review): the error branch does not set status, so an
+	 * inactive vsyscall is only reported here - confirm that this
+	 * is intended.
+	 */
+	if (has_vsyscall_map(0, &status))
+		print_info(__LINE__, "vsyscall active at process start\n");
+	else
+		print_error(__LINE__, "vsyscall inactive at process start\n");
+
+
+	check_time(&status);
+	check_lockout_after_fork(&status, 0);
+	check_lockout_after_fork(&status, 1);
+	check_no_lockout_after_enable(&status);
+	check_no_lockout_after_execve(argv, &status);
+	check_vsyscall_in_self_maps(&status);
+	check_vsyscall_maps_for_subprocess(&status);
+	check_einval(&status);
+
+	print_info(__LINE__, "testing done, exit status: ");
+	print_signed_dec(status);
+	print_char('\n');
+	return status;
+}
+
+/*
+ * Called from _start with the initial stack pointer in RSP.  The
+ * argument count is read from *RSP and the argument vector starts at
+ * RSP + 1.  Exits the process with the return value of main.
+ */
+static void __attribute__ ((used)) main_trampoline(long *rsp)
+{
+	sys_exit(main(*rsp, (char **) (rsp + 1)));
+}
+
+/*
+ * Process entry point: passes the initial stack pointer to
+ * main_trampoline as its first argument.
+ */
+__asm__(".text\n\t"
+	".globl _start\n"
+	"_start:\n\t"
+	".cfi_startproc\n\t"
+	".cfi_undefined rip\n\t"
+	"movq %rsp, %rdi\n\t"
+	"callq main_trampoline\n\t" /* Results in psABI %rsp alignment. */
+	".cfi_endproc\n\t"
+	".type _start, @function\n\t"
+	".size _start, . - _start\n\t"
+	".previous");
-- 
2.33.1