The following commit has been merged into the x86/cpu branch of tip: Commit-ID: daffd8a21847b2a37da6b4d23753a4581543c575 Gitweb: https://git.kernel.org/tip/daffd8a21847b2a37da6b4d23753a4581543c575 Author: Brian Gerst <brgerst@xxxxxxxxx> AuthorDate: Thu, 13 Mar 2025 14:22:34 -04:00 Committer: Ingo Molnar <mingo@xxxxxxxxxx> CommitterDate: Fri, 14 Mar 2025 10:32:51 +01:00 x86/syscall/64: Move the 64-bit syscall dispatch code to arch/x86/entry/syscall_64.c Move the 64-bit syscall dispatch code to syscall_64.c. No functional changes. Signed-off-by: Brian Gerst <brgerst@xxxxxxxxx> Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx> Reviewed-by: Sohil Mehta <sohil.mehta@xxxxxxxxx> Cc: Andy Lutomirski <luto@xxxxxxxxxx> Cc: Juergen Gross <jgross@xxxxxxxx> Cc: H. Peter Anvin <hpa@xxxxxxxxx> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Cc: Josh Poimboeuf <jpoimboe@xxxxxxxxxx> Link: https://lore.kernel.org/r/20250313182236.655724-4-brgerst@xxxxxxxxx --- arch/x86/entry/Makefile | 2 +- arch/x86/entry/common.c | 93 +-------------------------------- arch/x86/entry/syscall_64.c | 103 ++++++++++++++++++++++++++++++++++- 3 files changed, 103 insertions(+), 95 deletions(-) diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 96a6b86..5fd28ab 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -9,9 +9,11 @@ KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) CFLAGS_common.o += -fno-stack-protector CFLAGS_syscall_32.o += -fno-stack-protector +CFLAGS_syscall_64.o += -fno-stack-protector obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o obj-y += common.o diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 183efab..5bd448c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -32,99 +32,6 @@ #include <asm/syscall.h> #include <asm/irq_stack.h> -#ifdef CONFIG_X86_64 - -static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) -{ - /* - * Convert negative numbers to very high and thus out of range - * numbers for comparisons. - */ - unsigned int unr = nr; - - if (likely(unr < NR_syscalls)) { - unr = array_index_nospec(unr, NR_syscalls); - regs->ax = x64_sys_call(regs, unr); - return true; - } - return false; -} - -static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) -{ - /* - * Adjust the starting offset of the table, and convert numbers - * < __X32_SYSCALL_BIT to very high and thus out of range - * numbers for comparisons. - */ - unsigned int xnr = nr - __X32_SYSCALL_BIT; - - if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { - xnr = array_index_nospec(xnr, X32_NR_syscalls); - regs->ax = x32_sys_call(regs, xnr); - return true; - } - return false; -} - -/* Returns true to return using SYSRET, or false to use IRET */ -__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) -{ - add_random_kstack_offset(); - nr = syscall_enter_from_user_mode(regs, nr); - - instrumentation_begin(); - - if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { - /* Invalid system call, but still a system call. */ - regs->ax = __x64_sys_ni_syscall(regs); - } - - instrumentation_end(); - syscall_exit_to_user_mode(regs); - - /* - * Check that the register state is valid for using SYSRET to exit - * to userspace. Otherwise use the slower but fully capable IRET - * exit path. - */ - - /* XEN PV guests always use the IRET path */ - if (cpu_feature_enabled(X86_FEATURE_XENPV)) - return false; - - /* SYSRET requires RCX == RIP and R11 == EFLAGS */ - if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) - return false; - - /* CS and SS must match the values set in MSR_STAR */ - if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) - return false; - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * TASK_SIZE_MAX covers all user-accessible addresses other than - * the deprecated vsyscall page. - */ - if (unlikely(regs->ip >= TASK_SIZE_MAX)) - return false; - - /* - * SYSRET cannot restore RF. It can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. - */ - if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) - return false; - - /* Use SYSRET to exit to userspace */ - return true; -} -#endif - SYSCALL_DEFINE0(ni_syscall) { return -ENOSYS; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index ba83544..9e0ba33 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -1,10 +1,19 @@ -// SPDX-License-Identifier: GPL-2.0 -/* System call table for x86-64. */ +// SPDX-License-Identifier: GPL-2.0-only +/* + * 64-bit system call dispatch + * + * Copyright (c) 2015 Andrew Lutomirski + * + * Based on asm and ptrace code by many authors. The code here originated + * in ptrace.c and signal.c. + */ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> #include <linux/syscalls.h> +#include <linux/entry-common.h> +#include <linux/nospec.h> #include <asm/syscall.h> #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); @@ -34,3 +43,93 @@ long x64_sys_call(const struct pt_regs *regs, unsigned int nr) default: return __x64_sys_ni_syscall(regs); } }; + +static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) +{ + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < NR_syscalls)) { + unr = array_index_nospec(unr, NR_syscalls); + regs->ax = x64_sys_call(regs, unr); + return true; + } + return false; +} + +static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) +{ + /* + * Adjust the starting offset of the table, and convert numbers + * < __X32_SYSCALL_BIT to very high and thus out of range + * numbers for comparisons. + */ + unsigned int xnr = nr - __X32_SYSCALL_BIT; + + if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { + xnr = array_index_nospec(xnr, X32_NR_syscalls); + regs->ax = x32_sys_call(regs, xnr); + return true; + } + return false; +} + +/* Returns true to return using SYSRET, or false to use IRET */ +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) +{ + add_random_kstack_offset(); + nr = syscall_enter_from_user_mode(regs, nr); + + instrumentation_begin(); + + if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { + /* Invalid system call, but still a system call. */ + regs->ax = __x64_sys_ni_syscall(regs); + } + + instrumentation_end(); + syscall_exit_to_user_mode(regs); + + /* + * Check that the register state is valid for using SYSRET to exit + * to userspace. Otherwise use the slower but fully capable IRET + * exit path. + */ + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) + return false; + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * TASK_SIZE_MAX covers all user-accessible addresses other than + * the deprecated vsyscall page. + */ + if (unlikely(regs->ip >= TASK_SIZE_MAX)) + return false; + + /* + * SYSRET cannot restore RF. It can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. + */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) + return false; + + /* Use SYSRET to exit to userspace */ + return true; +}
![]() |