From: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>

See the comments in the cover letter. They will be moved into the code
and the changelog here once they are improved.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
---
 arch/x86/entry/Makefile          |   3 +
 arch/x86/entry/entry_64.S        | 193 ++++++++++++++++++++
 arch/x86/entry/ist_entry.c       | 299 +++++++++++++++++++++++++++++++
 arch/x86/kernel/asm-offsets_64.c |   7 +
 arch/x86/kernel/callthunks.c     |   2 +
 tools/objtool/check.c            |   7 +-
 6 files changed, 510 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/entry/ist_entry.c

diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index ca2fe186994b..7cc1254ca519 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -8,11 +8,14 @@ UBSAN_SANITIZE := n
 KCOV_INSTRUMENT := n
 
 CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_ist_entry.o = $(CC_FLAGS_FTRACE) $(RETHUNK_CFLAGS)
 
 CFLAGS_common.o += -fno-stack-protector
+CFLAGS_ist_entry.o += -fno-stack-protector
 
 obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
 obj-y += common.o
+obj-$(CONFIG_X86_64) += ist_entry.o
 
 obj-y += vdso/
 obj-y += vsyscall/

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 49ddc4dd3117..50a24cc83581 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -443,6 +443,184 @@ SYM_CODE_END(\asmsym)
 	idtentry \vector asm_\cfunc \cfunc has_error_code=0
 .endm
 
+/**
+ * idtentry_ist - Macro to generate entry stubs for IST exceptions except #DF
+ * @vector:		Vector number
+ * @asmsym:		ASM symbol for the entry point
+ * @cfunc:		C function to be called when the exception hits the kernel
+ * @user_cfunc:		C function to be called when the exception hits userspace
+ * @has_error_code:	Hardware pushed error code on stack
+ * @stack_offset:	Offset of the IST stack top in struct cea_exception_stacks
+ *
+ * The macro emits code to set up the kernel context for IST exceptions.
+ *
+ * Everything from the hardware entry of the event up to
+ * SYM_INNER_LABEL(commit_\asmsym) is the atomic-IST-entry (note: the
+ * atomic-IST-entry starts at the hardware entry, not merely at the
+ * first instruction of this macro).
+ *
+ * The atomic-IST-entry pushes pt_regs, copies the pt_regs to the IST
+ * main stack and switches to it. If the atomic-IST-entry is interrupted
+ * by another IST event (except #DF), the new atomic-IST-entry
+ * replicates the interrupted one, so that every atomic-IST-entry
+ * appears to be atomic.
+ *
+ * See the comments in ist_entry.c.
+ *
+ * While the CPU is on any IST stack or on the IST main stack, %rsp must
+ * not be switched away from the stack, except when the code is
+ * interrupted by an IST exception or when the stack is switched off
+ * completely (no usable data left on it).
+ *
+ * If the entry comes from user space, it eventually takes the normal
+ * entry path on its kernel stack, including the return-to-user-space
+ * work and the preemption checks on exit. The macro idtentry_body
+ * ensures that the IST main stack is switched off completely (no usable
+ * data left) at the same time that it switches to the kernel stack.
+ *
+ * If the exception hits kernel mode, it has to go through the paranoid
+ * entry, since the exception can hit any random state. There is no
+ * preemption check on exit, to keep the paranoid path simple.
+ */
+.macro idtentry_ist vector asmsym cfunc user_cfunc has_error_code:req, stack_offset:req
+SYM_CODE_START(\asmsym)
+	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+	ENDBR
+
+	/*
+	 * Clear X86_EFLAGS_AC, X86_EFLAGS_DF and set a default ORIG_RAX.
+	 *
+	 * The code setting ORIG_RAX will not be replicated if interrupted.
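+	 * (A nested atomic-IST-entry does not identify whether the
+	 * "pushq $-1" below had already been executed, so the ORIG_RAX
+	 * slot in the copied pt_regs may hold garbage; ORIG_RAX is
+	 * therefore set again unconditionally after the commit point.)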
+	 */
+	ASM_CLAC
+	cld
+
+	.if \has_error_code == 0
+		pushq	$-1		/* ORIG_RAX: no syscall to restart */
+	.endif
+
+	/*
+	 * No register except %rsp, %rflags and %rip can be touched before
+	 * all the registers have been pushed. This is indispensable for a
+	 * nested atomic-IST-entry to replicate the pushing of the
+	 * registers.
+	 */
+	PUSH_REGS
+
+	/*
+	 * Pushing the registers is finished; all registers can be touched
+	 * from now on.
+	 *
+	 * Clear the registers for the C function ist_copy_regs_to_main_stack()
+	 * and for the handler, to avoid any possible exploitation of any
+	 * speculation attack.
+	 */
+	CLEAR_REGS
+
+	/*
+	 * Copy the pt_regs to the IST main stack, including the pt_regs
+	 * of the interrupted atomic-IST-entries, if any, by replicating
+	 * them.
+	 */
+	movq	%rsp, %rdi		/* pt_regs pointer on its own IST stack */
+	leaq	PTREGS_SIZE-\stack_offset(%rsp), %rsi	/* struct cea_exception_stacks pointer */
+	call	ist_copy_regs_to_main_stack
+
+	/*
+	 * Commit stage.
+	 */
+SYM_INNER_LABEL(start_commit_\asmsym, SYM_L_GLOBAL)
+	/*
+	 * Switch to the IST main stack. Before the switch is done, %rax
+	 * holds the pointer to the copied pt_regs on the IST main stack.
+	 */
+	movq	%rax, %rsp
+
+	/*
+	 * The label must be immediately after the instruction that
+	 * switches the stack, since there is code that assumes the commit
+	 * stage consists of one single instruction, and that assumes
+	 * "%rsp being on the IST main stack is also the sign that an
+	 * atomic-IST-entry has ended". (That code will be removed in the
+	 * future, when %rip-based identification is added.)
+	 */
+SYM_INNER_LABEL(commit_\asmsym, SYM_L_GLOBAL)
+
+	/*
+	 * Now the CPU is on the IST main stack. To the rest of the
+	 * kernel, the entry of an IST exception becomes visible only from
+	 * this point on: the inside of an atomic-IST-entry cannot be seen
+	 * by anything except another atomic-IST-entry or #DF.
+	 */
+	UNWIND_HINT_REGS
+	ENCODE_FRAME_POINTER
+
+	/*
+	 * The code setting ORIG_RAX will not be replicated if
+	 * interrupted, so redo it here.
+	 */
+	.if \has_error_code == 0
+		movq	$-1, ORIG_RAX(%rsp)	/* ORIG_RAX: no syscall to restart */
+	.endif
+
+	/*
+	 * If the entry is from userspace, switch stacks and treat it as
+	 * a normal entry.
+	 */
+	testb	$3, CS(%rsp)
+	jnz	.Lfrom_usermode_switch_stack_\@
+
+	/*
+	 * paranoid_entry returns GS/CR3/SPEC_CTL information for
+	 * paranoid_exit in RBX/R14/R15.
+	 */
+	call	paranoid_entry
+
+	movq	%rsp, %rdi		/* pt_regs pointer */
+	.if \has_error_code == 1
+		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument */
+		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
+	.endif
+	call	\cfunc
+
+	jmp	paranoid_exit
+
+.Lfrom_usermode_switch_stack_\@:
+	/* Switch context: GS_BASE, CR3, SPEC_CTL. */
+	swapgs
+	FENCE_SWAPGS_USER_ENTRY
+	/* We have user CR3. Change to kernel CR3. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+	IBRS_ENTER
+	UNTRAIN_RET
+
+	/* Put the pt_regs onto the kernel task stack. */
+	movq	%rsp, %rdi		/* arg0 = pt_regs pointer */
+	call	sync_regs
+
+	/*
+	 * Switch to the kernel task stack and use the user entry point.
+	 *
+	 * When the entry is from user mode, the procedure also has to
+	 * switch off the TSS-configured IST stacks atomically, so it
+	 * switches to the IST main stack first and then switches off the
+	 * IST main stack in an atomic fashion: once %rsp leaves the IST
+	 * main stack, the IST main stack is totally free.
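+	 * (The switch-off is the single "movq %rax, %rsp" instruction
+	 * below; %rax holds the task-stack pt_regs pointer returned by
+	 * sync_regs() above.)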
+	 */
+	movq	%rax, %rsp
+	UNWIND_HINT_REGS
+	ENCODE_FRAME_POINTER
+
+	movq	%rsp, %rdi		/* pt_regs pointer into 1st argument */
+	.if \has_error_code == 1
+		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument */
+		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
+	.endif
+	call	\user_cfunc
+
+	/* For some configurations \user_cfunc ends up being a noreturn. */
+	REACHABLE
+
+	jmp	error_return
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+
 /**
  * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
  * @vector:		Vector number
@@ -586,8 +764,23 @@ SYM_CODE_END(\asmsym)
  */
 .macro idtentry_df vector asmsym cfunc
 SYM_CODE_START(\asmsym)
+
+	/*
+	 * This unwind hint is incorrect if this is the soft double fault
+	 * raised from ist_double_fault(). That does not matter, since the
+	 * double fault is unrecoverable anyway.
+	 */
 	UNWIND_HINT_IRET_REGS offset=8
 	ENDBR
+
+	/*
+	 * Set %rsp = %rsp - 8 if this is the soft double fault raised
+	 * from ist_double_fault(): the CPU does not push an error code in
+	 * that case, since the fault is injected by an INT instruction.
+	 * (The CPU aligns %rsp to 16 bytes when delivering the exception,
+	 * so bit 3 of %rsp is set exactly when no error code was pushed.)
+	 */
+	btr	$3, %rsp
+	UNWIND_HINT_IRET_REGS offset=8
+
 	ASM_CLAC
 	cld

diff --git a/arch/x86/entry/ist_entry.c b/arch/x86/entry/ist_entry.c
new file mode 100644
index 000000000000..e1b06306ac51
--- /dev/null
+++ b/arch/x86/entry/ist_entry.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022-2023 Lai Jiangshan, Ant Group
+ *
+ * Handle entries and exits for hardware traps and faults.
+ *
+ * This code is as low level as entry_64.S, and it can run in
+ * environments where the GS base is a user-controlled value, or CR3 is
+ * the PTI user CR3, or both.
+ */
+#include <asm/traps.h>
+
+#define IST_DOUBLE_FAULT_VECTOR		8
+
+static __always_inline void ist_double_fault(void)
+{
+	asm volatile ("int $" __stringify(IST_DOUBLE_FAULT_VECTOR));
+}
+
+#define IN_CEA_ESTACK(ceastp, name, sp)				\
+	((CEA_ESTACK_BOT(ceastp, name) <= (sp)) &&		\
+	 ((sp) < CEA_ESTACK_TOP(ceastp, name)))
+
+struct ist_ctx {
+	const struct pt_regs *regs;
+	unsigned long commit_ip;
+};
+
+#define DEFINE_IDENTIFY_IST(stack_name, sym_name, is_enabled)		\
+extern char commit_asm_exc_##sym_name[];				\
+static __always_inline bool identify_ist_##sym_name(			\
+	const struct pt_regs *regs, struct cea_exception_stacks *stacks,\
+	struct ist_ctx *ctx)						\
+{									\
+	if (!(is_enabled))						\
+		return false;						\
+	if (!IN_CEA_ESTACK(stacks, stack_name, regs->sp))		\
+		return false;						\
+	ctx->regs = (struct pt_regs *)CEA_ESTACK_TOP(stacks, stack_name) - 1;\
+	ctx->commit_ip = (unsigned long)commit_asm_exc_##sym_name;	\
+	return true;							\
+}
+
+DEFINE_IDENTIFY_IST(NMI, nmi, false)
+DEFINE_IDENTIFY_IST(DB, debug, false)
+DEFINE_IDENTIFY_IST(MCE, machine_check, false)
+DEFINE_IDENTIFY_IST(VC, vmm_communication, false)
+
+static __always_inline bool identify_ist(
+	const struct pt_regs *regs, struct cea_exception_stacks *stacks,
+	struct ist_ctx *ctx)
+{
+	return identify_ist_nmi(regs, stacks, ctx) ||
+	       identify_ist_debug(regs, stacks, ctx) ||
+	       identify_ist_machine_check(regs, stacks, ctx) ||
+	       identify_ist_vmm_communication(regs, stacks, ctx);
+}
+
+/*
+ * Identify whether an interrupted atomic-IST-entry had successfully
+ * saved the general registers onto its IST stack.
+ *
+ * Generally, the outmost atomic-IST-entry is likely to have saved the
+ * general registers successfully. If not, one of the nested
+ * atomic-IST-entries must have saved the general registers of the
+ * context that the outmost atomic-IST-entry had interrupted.
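+ *
+ * The check relies on the stack layout: @interrupted points at the
+ * lowest address of the interrupted entry's pt_regs, so if the %rsp at
+ * which that entry was interrupted (@nested->sp) had already descended
+ * to or below @interrupted, the interrupted entry must have finished
+ * PUSH_REGS.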
+ *
+ * Arguments:
+ * @nested: the nested atomic-IST-entry which interrupted @interrupted
+ * @interrupted: the interrupted atomic-IST-entry
+ *
+ * Returns:
+ * true:  the interrupted atomic-IST-entry had saved the general registers.
+ * false: the interrupted atomic-IST-entry had not yet saved the general registers.
+ */
+static __always_inline
+bool identify_if_gp_registers_saved(const struct pt_regs *nested, const struct pt_regs *interrupted)
+{
+	return nested->sp <= (unsigned long)(void *)interrupted;
+}
+
+static __always_inline
+void copy_regs_exception_head(struct pt_regs *target, const struct pt_regs *from)
+{
+	target->ss = from->ss;
+	target->sp = from->sp;
+	target->flags = from->flags;
+	target->cs = from->cs;
+	target->ip = from->ip;
+	target->orig_ax = from->orig_ax;
+}
+
+static __always_inline
+void copy_regs_general_registers(struct pt_regs *target, const struct pt_regs *from)
+{
+	target->di = from->di;
+	target->si = from->si;
+	target->dx = from->dx;
+	target->cx = from->cx;
+	target->ax = from->ax;
+	target->r8 = from->r8;
+	target->r9 = from->r9;
+	target->r10 = from->r10;
+	target->r11 = from->r11;
+	target->bx = from->bx;
+	target->bp = from->bp;
+	target->r12 = from->r12;
+	target->r13 = from->r13;
+	target->r14 = from->r14;
+	target->r15 = from->r15;
+}
+
+/*
+ * Do the work of the outmost atomic-IST-entry: copy the supposed
+ * pt_regs of the interrupted context to the IST main stack. (If the
+ * ongoing atomic-IST-entry is the outmost one, the work is literally
+ * the outmost entry's own copy; if not, the work replicates the copy
+ * the outmost entry would have done.)
+ *
+ * The hardware entry of the outmost atomic-IST-entry has already saved
+ * the exception head of the pt_regs. If the outmost atomic-IST-entry
+ * was unfortunately interrupted before it had fully saved the general
+ * registers, the general registers are untouched, and must have been
+ * saved by one of the subsequent nested atomic-IST-entries. The
+ * identifying code just examines all the nested atomic-IST-entries to
+ * find out which one saved the general registers.
+ */
+static __always_inline
+void copy_outmost(struct pt_regs *target, const struct pt_regs *outmost, const struct pt_regs *gp)
+{
+	copy_regs_exception_head(target, outmost);
+	copy_regs_general_registers(target, gp);
+}
+
+/*
+ * Replicate the interrupted atomic-IST-entry's CLAC and CLD in the ASM
+ * code. Even if SMAP is not enabled, CLAC is replicated
+ * unconditionally, since doing so does no harm.
+ */
+static __always_inline void replicate_clac_cld(struct pt_regs *target)
+{
+	target->flags &= ~(unsigned long)(X86_EFLAGS_AC | X86_EFLAGS_DF);
+}
+
+/* Replicate the interrupted atomic-IST-entry's CLEAR_REGS macro. */
+static __always_inline void replicate_clear_regs(struct pt_regs *target)
+{
+	target->di = 0;
+	target->si = 0;
+	target->dx = 0;
+	target->cx = 0;
+	target->ax = 0;
+	target->r8 = 0;
+	target->r9 = 0;
+	target->r10 = 0;
+	target->r11 = 0;
+	target->bx = 0;
+	target->bp = 0;
+	target->r12 = 0;
+	target->r13 = 0;
+	target->r14 = 0;
+	target->r15 = 0;
+}
+
+/*
+ * Replicate the fact that the interrupted atomic-IST-entry's call to
+ * ist_copy_regs_to_main_stack() clobbers the caller-saved registers.
+ */
+static __always_inline void replicate_func_clobber(struct pt_regs *target)
+{
+	/* Nothing needs to be done.
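+	 * The caller-saved registers in @target have already been zeroed
+	 * by replicate_clear_regs(), and whatever values the interrupted
+	 * call actually left in them are never consumed after the commit
+	 * point, so zero is as good a replicated value as any.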
+	 */
+}
+
+/*
+ * Replicate the copy operation of the interrupted atomic-IST-entry's
+ * ist_copy_regs_to_main_stack().
+ */
+static __always_inline void replicate_func_copy(struct pt_regs *target)
+{
+	/*
+	 * To avoid recursive function calls with __always_inline, the
+	 * copy operation for the interrupted atomic-IST-entry has been
+	 * done in the caller of copy_nested(). Nothing needs to be done.
+	 */
+}
+
+#define IST_FRAME_SIZE	ALIGN(sizeof(struct pt_regs), 16)
+
+/*
+ * Replicate the result that the interrupted atomic-IST-entry's
+ * ist_copy_regs_to_main_stack() returns in %rax, and the commit
+ * operation.
+ */
+static __always_inline void replicate_func_result_and_commit(struct pt_regs *target, unsigned long commit_ip)
+{
+	void *target_of_interrupted = (void *)target + IST_FRAME_SIZE;
+
+	/* the return value in %rax */
+	target->ax = (unsigned long)target_of_interrupted;
+	/* movq %rax, %rsp */
+	target->sp = (unsigned long)target_of_interrupted;
+	/* the %rip advances to the commit point */
+	target->ip = commit_ip;
+}
+
+/*
+ * Do the work of a nested atomic-IST-entry: copy the supposed pt_regs
+ * of the interrupted context to the IST main stack.
+ *
+ * The hardware entry of the nested atomic-IST-entry has already saved
+ * the exception head of the pt_regs of the interrupted context (inside
+ * the interrupted atomic-IST-entry). To maintain the atomic nature of
+ * the atomic-IST-entry, copy_nested() (of the ongoing nested
+ * atomic-IST-entry) has to replicate everything the interrupted
+ * atomic-IST-entry should have done up to the commit point, and copy
+ * the supposed saved context (pt_regs).
+ *
+ * To avoid touching any saved pt_regs, the replication is applied
+ * directly to the target pt_regs.
+ */
+static __always_inline
+void copy_nested(struct pt_regs *target, const struct pt_regs *nested, unsigned long commit_ip)
+{
+	copy_regs_exception_head(target, nested);
+	replicate_clac_cld(target);
+	replicate_clear_regs(target);
+	replicate_func_clobber(target);
+	replicate_func_copy(target);
+	replicate_func_result_and_commit(target, commit_ip);
+}
+
+asmlinkage __visible __noinstr_section(".entry.text")
+struct pt_regs *ist_copy_regs_to_main_stack(
+	const struct pt_regs *regs, struct cea_exception_stacks *stacks)
+{
+	unsigned long ist_main_sp = CEA_ESTACK_TOP(stacks, IST);
+	struct ist_ctx ist_ctx[8];
+	const struct pt_regs *gp_saved;
+	struct pt_regs *target;
+	int nr_entries, i;
+
+	/*
+	 * Identify all of the atomic-IST-entries.
+	 *
+	 * The current ongoing atomic-IST-entry doesn't need to be
+	 * identified, but it is also put in @ist_ctx[0] for later
+	 * convenience.
+	 *
+	 * The for-loop identifies which context @regs has interrupted.
+	 * It travels back to the outmost atomic-IST-entry.
+	 *
+	 * Result:
+	 * The identified results are put in ist_ctx[i].
+	 * ist_ctx[0] is the current ongoing atomic-IST-entry.
+	 * ist_ctx[nr_entries-1] is the outmost atomic-IST-entry.
+	 * gp_saved points to the pt_regs of the atomic-IST-entry that
+	 * saved the general registers.
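+	 *
+	 * If the chain does not fit in @ist_ctx, or the replicated frames
+	 * would overflow the IST main stack, the situation is treated as
+	 * unrecoverable, and a soft double fault is raised via
+	 * ist_double_fault().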
+	 */
+	ist_ctx[0].regs = regs;
+	ist_ctx[0].commit_ip = -1; /* unused */
+	nr_entries = 1;
+	gp_saved = regs;
+	for (;;) {
+		if (user_mode((struct pt_regs *)regs))
+			break;
+		if (ip_within_syscall_gap((struct pt_regs *)regs))
+			break;
+		if (!identify_ist(regs, stacks, &ist_ctx[nr_entries])) {
+			/* locate the top of the copy target pt_regs */
+			if (IN_CEA_ESTACK(stacks, IST, regs->sp))
+				ist_main_sp = ALIGN_DOWN(regs->sp, 16);
+			break;
+		}
+		if (identify_if_gp_registers_saved(regs, ist_ctx[nr_entries].regs))
+			gp_saved = ist_ctx[nr_entries].regs;
+		regs = ist_ctx[nr_entries].regs;
+		nr_entries++;
+		if (nr_entries >= ARRAY_SIZE(ist_ctx))
+			ist_double_fault();
+	}
+
+	if (!IN_CEA_ESTACK(stacks, IST, ist_main_sp - IST_FRAME_SIZE * nr_entries))
+		ist_double_fault();
+
+	/*
+	 * Copy the saved pt_regs to the IST main stack.
+	 *
+	 * For each atomic-IST-entry, including the interrupted ones and
+	 * the current ongoing one, either copy_outmost() or copy_nested()
+	 * is called to copy the pt_regs that should have been saved, by
+	 * replicating if needed, to the IST main stack.
+	 */
+	ist_main_sp -= IST_FRAME_SIZE;
+	target = (void *)ist_main_sp;
+	copy_outmost(target, ist_ctx[nr_entries - 1].regs, gp_saved);
+	for (i = nr_entries - 2; unlikely(i >= 0); i--) {
+		ist_main_sp -= IST_FRAME_SIZE;
+		target = (void *)ist_main_sp;
+		copy_nested(target, ist_ctx[i].regs, ist_ctx[i + 1].commit_ip);
+	}
+
+	return target;
+}

diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bb65371ea9df..f861a56c0002 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -60,5 +60,12 @@ int main(void)
 	OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary);
 	BLANK();
 #endif
+
+	DEFINE(CEA_stacks_NMI, offsetofend(struct cea_exception_stacks, NMI_stack));
+	DEFINE(CEA_stacks_DB, offsetofend(struct cea_exception_stacks, DB_stack));
+	DEFINE(CEA_stacks_MCE, offsetofend(struct cea_exception_stacks, MCE_stack));
+	DEFINE(CEA_stacks_VC, offsetofend(struct cea_exception_stacks, VC_stack));
+	BLANK();
+
 	return 0;
 }

diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index ffea98f9064b..e756c89996d8 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -123,6 +123,8 @@ static bool skip_addr(void *dest)
 {
 	if (dest == error_entry)
 		return true;
+	if (dest == ist_copy_regs_to_main_stack)
+		return true;
 	if (dest == paranoid_entry)
 		return true;
 	if (dest == xen_error_entry)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index f937be1afe65..8dfa627d4b41 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3998,6 +3998,11 @@ static int validate_unret(struct objtool_file *file)
 	return warnings;
 }
 
+static bool in_ist_entry(struct instruction *insn)
+{
+	return !strcmp(insn->sym->name, "ist_copy_regs_to_main_stack");
+}
+
 static int validate_retpoline(struct objtool_file *file)
 {
 	struct instruction *insn;
 
@@ -4016,7 +4021,7 @@ static int validate_retpoline(struct objtool_file *file)
 			continue;
 
 		if (insn->type == INSN_RETURN) {
-			if (opts.rethunk) {
+			if (opts.rethunk && !in_ist_entry(insn)) {
 				WARN_FUNC("'naked' return found in RETHUNK build",
 					  insn->sec, insn->offset);
 			} else
-- 
2.19.1.6.gb485710b