When a system call is required to run in an isolated context, the CR3 will be switched to the SCI page table. A per-cpu variable will contain an offset from the original CR3. This offset is used to switch back to the full kernel context when a trap occurs during an isolated system call. Signed-off-by: Mike Rapoport <rppt@xxxxxxxxxxxxx> --- arch/x86/entry/common.c | 61 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/process_64.c | 5 ++++ kernel/exit.c | 3 +++ 3 files changed, 69 insertions(+) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7bc105f..8f2a6fd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -25,12 +25,14 @@ #include <linux/uprobes.h> #include <linux/livepatch.h> #include <linux/syscalls.h> +#include <linux/sci.h> #include <asm/desc.h> #include <asm/traps.h> #include <asm/vdso.h> #include <linux/uaccess.h> #include <asm/cpufeature.h> +#include <asm/tlbflush.h> #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> @@ -269,6 +271,50 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) } #ifdef CONFIG_X86_64 + +#ifdef CONFIG_SYSCALL_ISOLATION +static inline bool sci_required(unsigned long nr) +{ + return false; +} + +static inline unsigned long sci_syscall_enter(unsigned long nr) +{ + unsigned long sci_cr3, kernel_cr3; + unsigned long asid; + + kernel_cr3 = __read_cr3(); + asid = kernel_cr3 & ~PAGE_MASK; + + sci_cr3 = build_cr3(current->sci->pgd, 0) & PAGE_MASK; + sci_cr3 |= (asid | (1 << X86_CR3_SCI_PCID_BIT)); + + current->in_isolated_syscall = 1; + current->sci->cr3_offset = kernel_cr3 - sci_cr3; + + this_cpu_write(cpu_sci.sci_syscall, 1); + this_cpu_write(cpu_sci.sci_cr3_offset, current->sci->cr3_offset); + + write_cr3(sci_cr3); + + return kernel_cr3; +} + +static inline void sci_syscall_exit(unsigned long cr3) +{ + if (cr3) { + write_cr3(cr3); + current->in_isolated_syscall = 0; + this_cpu_write(cpu_sci.sci_syscall, 0); + sci_clear_data(); + } +} +#else +static inline bool 
sci_required(unsigned long nr) { return false; } +static inline unsigned long sci_syscall_enter(unsigned long nr) { return 0; } +static inline void sci_syscall_exit(unsigned long cr3) {} +#endif + __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) { struct thread_info *ti; @@ -286,10 +332,25 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) */ nr &= __SYSCALL_MASK; if (likely(nr < NR_syscalls)) { + unsigned long sci_cr3 = 0; + nr = array_index_nospec(nr, NR_syscalls); + + if (sci_required(nr)) { + int err = sci_init(current); + + if (err) { + regs->ax = err; + goto err_return_from_syscall; + } + sci_cr3 = sci_syscall_enter(nr); + } + regs->ax = sys_call_table[nr](regs); + sci_syscall_exit(sci_cr3); } +err_return_from_syscall: syscall_return_slowpath(regs); } #endif diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6a62f4a..b8aa624 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,6 +55,8 @@ #include <asm/resctrl_sched.h> #include <asm/unistd.h> #include <asm/fsgsbase.h> +#include <asm/sci.h> + #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> @@ -581,6 +583,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_to_extra(prev_p, next_p); + /* update syscall isolation per-cpu data */ + sci_switch_to(next_p); + #ifdef CONFIG_XEN_PV /* * On Xen PV, IOPL bits in pt_regs->flags have no effect, and diff --git a/kernel/exit.c b/kernel/exit.c index 2639a30..8e81353 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -62,6 +62,7 @@ #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> +#include <linux/sci.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -859,6 +860,8 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + sci_exit(tsk); + exit_mm(); if (group_dead) -- 2.7.4