On Sun, Jun 26, 2016 at 5:55 PM, Andy Lutomirski <luto@xxxxxxxxxx> wrote: > This allows x86_64 kernels to enable vmapped stacks. There are a > couple of interesting bits. > > First, x86 lazily faults in top-level paging entries for the vmalloc > area. This won't work if we get a page fault while trying to access > the stack: the CPU will promote it to a double-fault and we'll die. > To avoid this problem, probe the new stack when switching stacks and > forcibly populate the pgd entry for the stack when switching mms. > > Second, once we have guard pages around the stack, we'll want to > detect and handle stack overflow. > > I didn't enable it on x86_32. We'd need to rework the double-fault > code a bit and I'm concerned about running out of vmalloc virtual > addresses under some workloads. > > This patch, by itself, will behave somewhat erratically when the > stack overflows while RSP is still more than a few tens of bytes > above the bottom of the stack. Specifically, we'll get #PF and make > it to no_context and an oops without triggering a double-fault, and > no_context doesn't know about stack overflows. The next patch will > improve that case. > > Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx> > --- > arch/x86/Kconfig | 1 + > arch/x86/include/asm/switch_to.h | 28 +++++++++++++++++++++++++++- > arch/x86/kernel/traps.c | 32 ++++++++++++++++++++++++++++++++ > arch/x86/mm/tlb.c | 15 +++++++++++++++ > 4 files changed, 75 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index d9a94da0c29f..afdcf96ef109 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -92,6 +92,7 @@ config X86 > select HAVE_ARCH_TRACEHOOK > select HAVE_ARCH_TRANSPARENT_HUGEPAGE > select HAVE_EBPF_JIT if X86_64 > + select HAVE_ARCH_VMAP_STACK if X86_64 > select HAVE_CC_STACKPROTECTOR > select HAVE_CMPXCHG_DOUBLE > select HAVE_CMPXCHG_LOCAL > diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h > index 8f321a1b03a1..14e4b20f0aaf 100644 > --- a/arch/x86/include/asm/switch_to.h > +++ b/arch/x86/include/asm/switch_to.h > @@ -8,6 +8,28 @@ struct tss_struct; > void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, > struct tss_struct *tss); > > +/* This runs runs on the previous thread's stack. */ > +static inline void prepare_switch_to(struct task_struct *prev, > + struct task_struct *next) > +{ > +#ifdef CONFIG_VMAP_STACK > + /* > + * If we switch to a stack that has a top-level paging entry > + * that is not present in the current mm, the resulting #PF will > + * will be promoted to a double-fault and we'll panic. Probe > + * the new stack now so that vmalloc_fault can fix up the page > + * tables if needed. This can only happen if we use a stack > + * in vmap space. > + * > + * We assume that the stack is aligned so that it never spans > + * more than one top-level paging entry. > + * > + * To minimize cache pollution, just follow the stack pointer. > + */ > + READ_ONCE(*(unsigned char *)next->thread.sp); > +#endif > +} > + > #ifdef CONFIG_X86_32 > > #ifdef CONFIG_CC_STACKPROTECTOR > @@ -39,6 +61,8 @@ do { \ > */ \ > unsigned long ebx, ecx, edx, esi, edi; \ > \ > + prepare_switch_to(prev, next); \ > + \ > asm volatile("pushl %%ebp\n\t" /* save EBP */ \ > "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ > "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ > @@ -103,7 +127,9 @@ do { \ > * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL > * has no effect. > */ > -#define switch_to(prev, next, last) \ > +#define switch_to(prev, next, last) \ > + prepare_switch_to(prev, next); \ > + \ > asm volatile(SAVE_CONTEXT \ > "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ > "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ > diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c > index 00f03d82e69a..9cb7ea781176 100644 > --- a/arch/x86/kernel/traps.c > +++ b/arch/x86/kernel/traps.c > @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) > DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) > DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) > > +#ifdef CONFIG_VMAP_STACK > +static void __noreturn handle_stack_overflow(const char *message, > + struct pt_regs *regs, > + unsigned long fault_address) > +{ > + printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", > + (void *)fault_address, current->stack, > + (char *)current->stack + THREAD_SIZE - 1); > + die(message, regs, 0); > + > + /* Be absolutely certain we don't return. */ > + panic(message); > +} > +#endif > + > #ifdef CONFIG_X86_64 > /* Runs on IST stack */ > dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) > { > static const char str[] = "double fault"; > struct task_struct *tsk = current; > +#ifdef CONFIG_VMAP_STACK > + unsigned long cr2; > +#endif > > #ifdef CONFIG_X86_ESPFIX64 > extern unsigned char native_irq_return_iret[]; > @@ -332,6 +350,20 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) > tsk->thread.error_code = error_code; > tsk->thread.trap_nr = X86_TRAP_DF; > > +#ifdef CONFIG_VMAP_STACK > + /* > + * If we overflow the stack into a guard page, the CPU will fail > + * to deliver #PF and will send #DF instead. CR2 will contain > + * the linear address of the second fault, which will be in the > + * guard page below the bottom of the stack. > + */ > + cr2 = read_cr2(); > + if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE) > + handle_stack_overflow( > + "kernel stack overflow (double-fault)", > + regs, cr2); > +#endif Is there any other way to tell if this was from a page fault? If it wasn't a page fault then CR2 is undefined. -- Brian Gerst -- To unsubscribe from this list: send the line "unsubscribe linux-arch" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html