On 2021/9/16 下午6:02, Peter Zijlstra wrote: > On Thu, Sep 16, 2021 at 10:03:19AM +0200, Peter Zijlstra wrote: > >> Oh, I'm an idiot... yes it tries to read regs from the stack, but clearly >> that won't work for the guard page. > > OK, extended it to also cover task and IRQ stacks. get_stack_info() > doesn't seem to know about SOFTIRQ stacks on 64bit, might have to look > into that next. > > Andy, what's the story with page_fault_oops(), according to the comment > in exc_double_fault() actual stack overflows will always hit #DF. Just gave this one a test; it is still not working properly... [ 51.016033][ C0] traps: PANIC: double fault, error_code: 0x0 [ 51.016047][ C0] double fault: 0000 [#1] SMP PTI [ 51.016054][ C0] CPU: 0 PID: 761 Comm: a.out Not tainted 5.14.0-next-20210913+ #543 [ 51.016061][ C0] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 51.016065][ C0] RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70 [ 51.016079][ C0] Code: 48 03 43 28 48 8b 0c 24 bb 01 00 00 00 4c 29 f0 48 39 c8 48 0f 47 c1 49 89 45 08 e9 48 ff ff ff 66 2e 0f 1f 84 00 00 00 00 00 <55> 53 e8 09 20 f2 ff 48 c7 c2 20 4d 03 00 65 48 03 15 5a 3b d2 7e [ 51.016086][ C0] RSP: 0018:fffffe000000b000 EFLAGS: 00010046 [ 51.016093][ C0] RAX: 0000000080120008 RBX: fffffe000000b050 RCX: 0000000000000000 [ 51.016097][ C0] RDX: ffff888106c3c300 RSI: ffffffff81269031 RDI: 000000000000001c [ 51.016102][ C0] RBP: 000000000000001c R08: 0000000000000001 R09: 0000000000000000 [ 51.016106][ C0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 51.016109][ C0] R13: fffffe000000b044 R14: 0000000000000001 R15: 0000000000000001 [ 51.016113][ C0] FS: 00007f0cfd961740(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 [ 51.016120][ C0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 51.016124][ C0] CR2: fffffe000000aff8 CR3: 0000000105ecc001 CR4: 00000000003606f0 [ 51.016129][ C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 51.016132][ C0] DR3: 0000000000000000 DR6: 
00000000fffe0ff0 DR7: 0000000000000400 [ 51.016136][ C0] Call Trace: [ 51.016139][ C0] <TASK> [ 51.016141][ C0] </TASK> [ 51.016144][ C0] Modules linked in: [ 51.042436][ C0] ---[ end trace 5c102ce76b073dcf ]--- [ 51.042440][ C0] RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70 [ 51.042450][ C0] Code: 48 03 43 28 48 8b 0c 24 bb 01 00 00 00 4c 29 f0 48 39 c8 48 0f 47 c1 49 89 45 08 e9 48 ff ff ff 66 2e 0f 1f 84 00 00 00 00 00 <55> 53 e8 09 20 f2 ff 48 c7 c2 20 4d 03 00 65 48 03 15 5a 3b d2 7e [ 51.042457][ C0] RSP: 0018:fffffe000000b000 EFLAGS: 00010046 [ 51.042462][ C0] RAX: 0000000080120008 RBX: fffffe000000b050 RCX: 0000000000000000 [ 51.042466][ C0] RDX: ffff888106c3c300 RSI: ffffffff81269031 RDI: 000000000000001c [ 51.042470][ C0] RBP: 000000000000001c R08: 0000000000000001 R09: 0000000000000000 [ 51.042479][ C0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 51.042483][ C0] R13: fffffe000000b044 R14: 0000000000000001 R15: 0000000000000001 [ 51.042487][ C0] FS: 00007f0cfd961740(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 [ 51.042493][ C0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 51.042497][ C0] CR2: fffffe000000aff8 CR3: 0000000105ecc001 CR4: 00000000003606f0 [ 51.042501][ C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 51.042505][ C0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 51.042510][ C0] Kernel panic - not syncing: Fatal exception in interrupt [ 51.042917][ C0] Kernel Offset: disabled Regards, Michael Wang > > --- > diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h > index 3d52b094850a..c4e92462c2b4 100644 > --- a/arch/x86/include/asm/cpu_entry_area.h > +++ b/arch/x86/include/asm/cpu_entry_area.h > @@ -61,6 +61,9 @@ enum exception_stack_ordering { > #define CEA_ESTACK_OFFS(st) \ > offsetof(struct cea_exception_stacks, st## _stack) > > +#define CEA_EGUARD_OFFS(st) \ > + offsetof(struct cea_exception_stacks, st## 
_stack_guard) > + > #define CEA_ESTACK_PAGES \ > (sizeof(struct cea_exception_stacks) / PAGE_SIZE) > > diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h > index f248eb2ac2d4..8ff346579330 100644 > --- a/arch/x86/include/asm/stacktrace.h > +++ b/arch/x86/include/asm/stacktrace.h > @@ -14,13 +14,14 @@ > #include <asm/switch_to.h> > > enum stack_type { > - STACK_TYPE_UNKNOWN, > + STACK_TYPE_UNKNOWN = 0, > STACK_TYPE_TASK, > STACK_TYPE_IRQ, > STACK_TYPE_SOFTIRQ, > STACK_TYPE_ENTRY, > STACK_TYPE_EXCEPTION, > STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, > + STACK_TYPE_GUARD = 0x80, > }; > > struct stack_info { > @@ -31,6 +32,15 @@ struct stack_info { > bool in_task_stack(unsigned long *stack, struct task_struct *task, > struct stack_info *info); > > +static __always_inline bool in_stack_guard(void *addr, void *begin, void *end) > +{ > +#ifdef CONFIG_VMAP_STACK > + if (addr > (begin - PAGE_SIZE)) > + return true; > +#endif > + return false; > +} > + > bool in_entry_stack(unsigned long *stack, struct stack_info *info); > > int get_stack_info(unsigned long *stack, struct task_struct *task, > diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c > index ea4fe192189d..91b406fe2a39 100644 > --- a/arch/x86/kernel/dumpstack.c > +++ b/arch/x86/kernel/dumpstack.c > @@ -32,12 +32,19 @@ static struct pt_regs exec_summary_regs; > bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task, > struct stack_info *info) > { > - unsigned long *begin = task_stack_page(task); > - unsigned long *end = task_stack_page(task) + THREAD_SIZE; > - > - if (stack < begin || stack >= end) > + void *begin = task_stack_page(task); > + void *end = begin + THREAD_SIZE; > + int type = STACK_TYPE_TASK; > + > + if ((void *)stack < begin || (void *)stack >= end) { > + if (in_stack_guard(stack, begin, end)) { > + type |= STACK_TYPE_GUARD; > + goto fill_info; > + } > return false; > + } > > +fill_info: > info->type = 
STACK_TYPE_TASK; > info->begin = begin; > info->end = end; > @@ -50,14 +57,20 @@ bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task, > bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info) > { > struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); > - > + int type = STACK_TYPE_ENTRY; > void *begin = ss; > void *end = ss + 1; > > - if ((void *)stack < begin || (void *)stack >= end) > + if ((void *)stack < begin || (void *)stack >= end) { > + if (in_stack_guard(stack, begin, end)) { > + type |= STACK_TYPE_GUARD; > + goto fill_info; > + } > return false; > + } > > - info->type = STACK_TYPE_ENTRY; > +fill_info: > + info->type = type; > info->begin = begin; > info->end = end; > info->next_sp = NULL; > diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c > index 5601b95944fa..3634bdf9ab36 100644 > --- a/arch/x86/kernel/dumpstack_64.c > +++ b/arch/x86/kernel/dumpstack_64.c > @@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type) > { > BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); > > + if (type == STACK_TYPE_TASK) > + return "TASK"; > + > if (type == STACK_TYPE_IRQ) > return "IRQ"; > > + if (type == STACK_TYPE_SOFTIRQ) > + return "SOFTIRQ"; > + > if (type == STACK_TYPE_ENTRY) { > /* > * On 64-bit, we have a generic entry stack that we > @@ -63,6 +69,11 @@ struct estack_pages { > }; > > #define EPAGERANGE(st) \ > + [PFN_DOWN(CEA_EGUARD_OFFS(st))] = { \ > + .offs = CEA_EGUARD_OFFS(st), \ > + .size = PAGE_SIZE, \ > + .type = STACK_TYPE_GUARD + \ > + STACK_TYPE_EXCEPTION + ESTACK_ ##st, }, \ > [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ > PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ > .offs = CEA_ESTACK_OFFS(st), \ > @@ -111,7 +122,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac > k = (stk - begin) >> PAGE_SHIFT; > /* Lookup the page descriptor */ > ep = &estack_pages[k]; > - /* Guard page? 
*/ > + /* unknown entry */ > if (!ep->size) > return false; > > @@ -122,7 +133,12 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac > info->type = ep->type; > info->begin = (unsigned long *)begin; > info->end = (unsigned long *)end; > - info->next_sp = (unsigned long *)regs->sp; > + info->next_sp = NULL; > + > + /* Can't read regs from a guard page. */ > + if (!(ep->type & STACK_TYPE_GUARD)) > + info->next_sp = (unsigned long *)regs->sp; > + > return true; > } > > @@ -130,6 +146,7 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info > { > unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); > unsigned long *begin; > + int type = STACK_TYPE_IRQ; > > /* > * @end points directly to the top most stack entry to avoid a -8 > @@ -144,19 +161,27 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info > * final operation is 'popq %rsp' which means after that RSP points > * to the original stack and not to @end. > */ > - if (stack < begin || stack >= end) > + if (stack < begin || stack >= end) { > + if (in_stack_guard(stack, begin, end)) { > + type |= STACK_TYPE_GUARD; > + goto fill_info; > + } > return false; > + } > > - info->type = STACK_TYPE_IRQ; > +fill_info: > + info->type = type; > info->begin = begin; > info->end = end; > + info->next_sp = NULL; > > /* > * The next stack pointer is stored at the top of the irq stack > * before switching to the irq stack. Actual stack entries are all > * below that. > */ > - info->next_sp = (unsigned long *)*(end - 1); > + if (!(type & STACK_TYPE_GUARD)) > + info->next_sp = (unsigned long *)*(end - 1); > > return true; > } > @@ -193,6 +218,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, > if (!get_stack_info_noinstr(stack, task, info)) > goto unknown; > > + if (info->type & STACK_TYPE_GUARD) > + goto unknown; > + > /* > * Make sure we don't iterate through any given stack more than once. 
> * If it comes up a second time then there's something wrong going on: > diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c > index a58800973aed..80f6d8d735eb 100644 > --- a/arch/x86/kernel/traps.c > +++ b/arch/x86/kernel/traps.c > @@ -353,6 +353,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) > > #ifdef CONFIG_VMAP_STACK > unsigned long address = read_cr2(); > + struct stack_info info; > #endif > > #ifdef CONFIG_X86_ESPFIX64 > @@ -455,9 +456,11 @@ DEFINE_IDTENTRY_DF(exc_double_fault) > * stack even if the actual trigger for the double fault was > * something else. > */ > - if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) { > - handle_stack_overflow("kernel stack overflow (double-fault)", > - regs, address); > + if (get_stack_info_noinstr((void *)address, current, &info) && > + info.type & STACK_TYPE_GUARD) { > + const char *name = stack_type_name(info.type & ~STACK_TYPE_GUARD); > + pr_emerg("BUG: %s stack guard hit at %p (stack is %p..%p)\n", > + name, (void *)address, info.begin, info.end); > } > #endif > > @@ -708,7 +711,9 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r > sp = regs->sp; > stack = (unsigned long *)sp; > > - if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || > + if (!get_stack_info_noinstr(stack, current, &info) || > + info.type & STACK_TYPE_GUARD || > + info.type == STACK_TYPE_ENTRY || > info.type >= STACK_TYPE_EXCEPTION_LAST) > sp = __this_cpu_ist_top_va(VC2); > >