Hi,

On 22.11.2021 10:28, Ard Biesheuvel wrote:
> Wire up the generic support for managing task stack allocations via vmalloc,
> and implement the entry code that detects whether we faulted because of a
> stack overrun (or future stack overrun caused by pushing the pt_regs array)
>
> While this adds a fair amount of tricky entry asm code, it should be
> noted that it only adds a TST + branch to the svc_entry path. The code
> implementing the non-trivial handling of the overflow stack is emitted
> out-of-line into the .text section.
>
> Since on ARM, we rely on do_translation_fault() to keep PMD level page
> table entries that cover the vmalloc region up to date, we need to
> ensure that we don't hit such a stale PMD entry when accessing the
> stack. So we do a dummy read from the new stack while still running from
> the old one on the context switch path, and bump the vmalloc_seq counter
> when PMD level entries in the vmalloc range are modified, so that the MM
> switch fetches the latest version of the entries.
>
> Note that we need to increase the per-mode stack by 1 word, to gain some
> space to stash a GPR until we know it is safe to touch the stack.
> However, due to the cacheline alignment of the struct, this does not
> actually increase the memory footprint of the struct stack array at all.
>
> Signed-off-by: Ard Biesheuvel <ardb@xxxxxxxxxx>
> Tested-by: Keith Packard <keithpac@xxxxxxxxxx>

This patch landed recently in linux-next 20211220 as commit a1c510d0adc6
("ARM: implement support for vmap'ed stacks"). Sadly, it breaks
suspend/resume on all 32-bit ARM Exynos SoCs. The suspend/resume code
probably needs to be updated as well (parts of it run on physical
addresses with the MMU disabled), but I haven't analyzed it yet. If you
have any hints, let me know. Thanks.

> ---
>  arch/arm/Kconfig                   |  1 +
>  arch/arm/include/asm/page.h        |  4 +
>  arch/arm/include/asm/thread_info.h |  8 ++
>  arch/arm/kernel/entry-armv.S       | 97 +++++++++++++++++++-
>  arch/arm/kernel/entry-header.S     | 37 ++++++++
>  arch/arm/kernel/irq.c              |  9 +-
>  arch/arm/kernel/setup.c            |  8 +-
>  arch/arm/kernel/sleep.S            |  8 ++
>  arch/arm/kernel/traps.c            | 80 +++++++++++++++-
>  arch/arm/kernel/unwind.c           |  3 +-
>  arch/arm/kernel/vmlinux.lds.S      |  4 +-
>  11 files changed, 244 insertions(+), 15 deletions(-)
>
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index b1eba1b4168c..7a0853bd298f 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -127,6 +127,7 @@ config ARM
>  	select RTC_LIB
>  	select SYS_SUPPORTS_APM_EMULATION
>  	select THREAD_INFO_IN_TASK if CURRENT_POINTER_IN_TPIDRURO
> +	select HAVE_ARCH_VMAP_STACK if THREAD_INFO_IN_TASK && (!LD_IS_LLD || LLD_VERSION >= 140000)
>  	select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M
>  	# Above selects are sorted alphabetically; please add new ones
>  	# according to that.
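As an aside, for anyone trying to follow the overflow check itself: with the
stacks aligned to 2 * THREAD_SIZE, the single TST added to svc_entry boils
down to testing one bit of the stack pointer. Below is a minimal stand-alone
C sketch of that test; it is only an illustration, and the PAGE_SHIFT /
THREAD_SIZE_ORDER values (4 KiB pages, 8 KiB stacks) as well as the
sp_overflowed() helper are example assumptions, not taken from the patch:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT        12    /* example: 4 KiB pages */
#define THREAD_SIZE_ORDER 1     /* example: 8 KiB stacks */
#define THREAD_SIZE       (1UL << (THREAD_SIZE_ORDER + PAGE_SHIFT))

/*
 * With a stack aligned to 2 * THREAD_SIZE, bit (THREAD_SIZE_ORDER +
 * PAGE_SHIFT) of SP is zero while SP stays inside the stack and becomes
 * one as soon as SP drops below the stack's base. This is the condition
 * the TST + branch pair in svc_entry checks.
 */
static bool sp_overflowed(unsigned long sp)
{
        return sp & THREAD_SIZE;
}

int main(void)
{
        unsigned long base = 0xf0004000UL;   /* aligned to 2 * THREAD_SIZE */

        printf("inside stack: %d\n", sp_overflowed(base + 0x100));  /* 0 */
        printf("overflowed:   %d\n", sp_overflowed(base - 0x010));  /* 1 */
        return 0;
}

This is also why THREAD_ALIGN is doubled to 2 * THREAD_SIZE when
CONFIG_VMAP_STACK is enabled: without that alignment, the bit test could not
tell "still on the stack" apart from "just ran off its bottom".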
> diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
> index 11b058a72a5b..7b871ed99ccf 100644
> --- a/arch/arm/include/asm/page.h
> +++ b/arch/arm/include/asm/page.h
> @@ -149,6 +149,10 @@ extern void copy_page(void *to, const void *from);
>  #include <asm/pgtable-2level-types.h>
>  #endif
>
> +#ifdef CONFIG_VMAP_STACK
> +#define ARCH_PAGE_TABLE_SYNC_MASK	PGTBL_PMD_MODIFIED
> +#endif
> +
>  #endif /* CONFIG_MMU */
>
>  typedef struct page *pgtable_t;
> diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
> index 164e15f26485..004b89d86224 100644
> --- a/arch/arm/include/asm/thread_info.h
> +++ b/arch/arm/include/asm/thread_info.h
> @@ -25,6 +25,14 @@
>  #define THREAD_SIZE		(PAGE_SIZE << THREAD_SIZE_ORDER)
>  #define THREAD_START_SP		(THREAD_SIZE - 8)
>
> +#ifdef CONFIG_VMAP_STACK
> +#define THREAD_ALIGN		(2 * THREAD_SIZE)
> +#else
> +#define THREAD_ALIGN		THREAD_SIZE
> +#endif
> +
> +#define OVERFLOW_STACK_SIZE	SZ_4K
> +
>  #ifndef __ASSEMBLY__
>
>  struct task_struct;
> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
> index b447f7d0708c..54210dce80e1 100644
> --- a/arch/arm/kernel/entry-armv.S
> +++ b/arch/arm/kernel/entry-armv.S
> @@ -57,6 +57,10 @@ UNWIND(	.setfp	fpreg, sp	)
>  	@
>  	subs	r2, sp, r0		@ SP above bottom of IRQ stack?
>  	rsbscs	r2, r2, #THREAD_SIZE	@ ... and below the top?
> +#ifdef CONFIG_VMAP_STACK
> +	ldr_l	r2, high_memory, cc	@ End of the linear region
> +	cmpcc	r2, r0			@ Stack pointer was below it?
> +#endif
>  	movcs	sp, r0			@ If so, revert to incoming SP
>
>  #ifndef CONFIG_UNWINDER_ARM
> @@ -188,13 +192,18 @@ ENDPROC(__und_invalid)
>  #define SPFIX(code...)
>  #endif
>
> -	.macro	svc_entry, stack_hole=0, trace=1, uaccess=1
> +	.macro	svc_entry, stack_hole=0, trace=1, uaccess=1, overflow_check=1
>  UNWIND(.fnstart		)
> -UNWIND(.save {r0 - pc}		)
>  	sub	sp, sp, #(SVC_REGS_SIZE + \stack_hole)
> +THUMB(	add	sp, r1		)	@ get SP in a GPR without
> +THUMB(	sub	r1, sp, r1	)	@ using a temp register
> +
> +	.if	\overflow_check
> +UNWIND(.save {r0 - pc}		)
> +	do_overflow_check (SVC_REGS_SIZE + \stack_hole)
> +	.endif
> +
>  #ifdef CONFIG_THUMB2_KERNEL
> -	add	sp, r1			@ get SP in a GPR without
> -	sub	r1, sp, r1		@ using a temp register
>  	tst	r1, #4			@ test stack pointer alignment
>  	sub	r1, sp, r1		@ restore original R0
>  	sub	sp, r1			@ restore original SP
> @@ -827,12 +836,20 @@ ENTRY(__switch_to)
>  	str	r7, [r8]
>  #endif
>  	mov	r0, r5
> -#if !defined(CONFIG_THUMB2_KERNEL)
> +#if !defined(CONFIG_THUMB2_KERNEL) && !defined(CONFIG_VMAP_STACK)
>  	set_current r7
>  	ldmia	r4, {r4 - sl, fp, sp, pc}	@ Load all regs saved previously
>  #else
>  	mov	r1, r7
>  	ldmia	r4, {r4 - sl, fp, ip, lr}	@ Load all regs saved previously
> +#ifdef CONFIG_VMAP_STACK
> +	@
> +	@ Do a dummy read from the new stack while running from the old one so
> +	@ that we can rely on do_translation_fault() to fix up any stale PMD
> +	@ entries covering the vmalloc region.
> +	@
> +	ldr	r2, [ip]
> +#endif
>
>  	@ When CONFIG_THREAD_INFO_IN_TASK=n, the update of SP itself is what
>  	@ effectuates the task switch, as that is what causes the observable
> @@ -849,6 +866,76 @@ ENTRY(__switch_to)
>  UNWIND(.fnend		)
>  ENDPROC(__switch_to)
>
> +#ifdef CONFIG_VMAP_STACK
> +	.text
> +	.align	2
> +__bad_stack:
> +	@
> +	@ We've just detected an overflow. We need to load the address of this
> +	@ CPU's overflow stack into the stack pointer register. We have only one
> +	@ scratch register so let's use a sequence of ADDs including one
> +	@ involving the PC, and decorate them with PC-relative group
> +	@ relocations. As these are ARM only, switch to ARM mode first.
> +	@
> +	@ We enter here with IP clobbered and its value stashed on the mode
> +	@ stack.
> +	@
> +THUMB(	bx	pc	)
> +THUMB(	nop		)
> +THUMB(	.arm		)
> +	mrc	p15, 0, ip, c13, c0, 4		@ Get per-CPU offset
> +
> +	.globl	overflow_stack_ptr
> +	.reloc	0f, R_ARM_ALU_PC_G0_NC, overflow_stack_ptr
> +	.reloc	1f, R_ARM_ALU_PC_G1_NC, overflow_stack_ptr
> +	.reloc	2f, R_ARM_LDR_PC_G2, overflow_stack_ptr
> +	add	ip, ip, pc
> +0:	add	ip, ip, #-4
> +1:	add	ip, ip, #0
> +2:	ldr	ip, [ip, #4]
> +
> +	str	sp, [ip, #-4]!		@ Preserve original SP value
> +	mov	sp, ip			@ Switch to overflow stack
> +	pop	{ip}			@ Original SP in IP
> +
> +#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC)
> +	mov	ip, ip			@ mov expected by unwinder
> +	push	{fp, ip, lr, pc}	@ GCC flavor frame record
> +#else
> +	str	ip, [sp, #-8]!		@ store original SP
> +	push	{fpreg, lr}		@ Clang flavor frame record
> +#endif
> +UNWIND( ldr	ip, [r0, #4]	)	@ load exception LR
> +UNWIND( str	ip, [sp, #12]	)	@ store in the frame record
> +	ldr	ip, [r0, #12]		@ reload IP
> +
> +	@ Store the original GPRs to the new stack.
> +	svc_entry uaccess=0, overflow_check=0
> +
> +UNWIND( .save	{sp, pc}	)
> +UNWIND( .save	{fpreg, lr}	)
> +UNWIND( .setfp	fpreg, sp	)
> +
> +	ldr	fpreg, [sp, #S_SP]	@ Add our frame record
> +					@ to the linked list
> +#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC)
> +	ldr	r1, [fp, #4]		@ reload SP at entry
> +	add	fp, fp, #12
> +#else
> +	ldr	r1, [fpreg, #8]
> +#endif
> +	str	r1, [sp, #S_SP]		@ store in pt_regs
> +
> +	@ Stash the regs for handle_bad_stack
> +	mov	r0, sp
> +
> +	@ Time to die
> +	bl	handle_bad_stack
> +	nop
> +UNWIND( .fnend		)
> +ENDPROC(__bad_stack)
> +#endif
> +
>  	__INIT
>
>  /*
> diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
> index ae24dd54e9ef..81df2a3561ca 100644
> --- a/arch/arm/kernel/entry-header.S
> +++ b/arch/arm/kernel/entry-header.S
> @@ -423,3 +423,40 @@ scno	.req	r7		@ syscall number
>  tbl	.req	r8		@ syscall table pointer
>  why	.req	r8		@ Linux syscall (!= 0)
>  tsk	.req	r9		@ current thread_info
> +
> +	.macro	do_overflow_check, frame_size:req
> +#ifdef CONFIG_VMAP_STACK
> +	@
> +	@ Test whether the SP has overflowed. Task and IRQ stacks are aligned
> +	@ so that SP & BIT(THREAD_SIZE_ORDER + PAGE_SHIFT) should always be
> +	@ zero.
> +	@
> +ARM(	tst	sp, #1 << (THREAD_SIZE_ORDER + PAGE_SHIFT)	)
> +THUMB(	tst	r1, #1 << (THREAD_SIZE_ORDER + PAGE_SHIFT)	)
> +THUMB(	it	ne						)
> +	bne	.Lstack_overflow_check\@
> +
> +	.pushsection	.text
> +.Lstack_overflow_check\@:
> +	@
> +	@ The stack pointer is not pointing to a valid vmap'ed stack, but it
> +	@ may be pointing into the linear map instead, which may happen if we
> +	@ are already running from the overflow stack. We cannot detect overflow
> +	@ in such cases so just carry on.
> +	@
> +	str	ip, [r0, #12]		@ Stash IP on the mode stack
> +	ldr_l	ip, high_memory		@ Start of VMALLOC space
> +ARM(	cmp	sp, ip		)	@ SP in vmalloc space?
> +THUMB(	cmp	r1, ip		)
> +THUMB(	itt	lo		)
> +	ldrlo	ip, [r0, #12]		@ Restore IP
> +	blo	.Lout\@			@ Carry on
> +
> +THUMB(	sub	r1, sp, r1	)	@ Restore original R1
> +THUMB(	sub	sp, r1		)	@ Restore original SP
> +	add	sp, sp, #\frame_size	@ Undo svc_entry's SP change
> +	b	__bad_stack		@ Handle VMAP stack overflow
> +	.popsection
> +.Lout\@:
> +#endif
> +	.endm
> diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
> index e05219bca218..5deb40f39999 100644
> --- a/arch/arm/kernel/irq.c
> +++ b/arch/arm/kernel/irq.c
> @@ -56,7 +56,14 @@ static void __init init_irq_stacks(void)
>  	int cpu;
>
>  	for_each_possible_cpu(cpu) {
> -		stack = (u8 *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
> +		if (!IS_ENABLED(CONFIG_VMAP_STACK))
> +			stack = (u8 *)__get_free_pages(GFP_KERNEL,
> +						       THREAD_SIZE_ORDER);
> +		else
> +			stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
> +					       THREADINFO_GFP, NUMA_NO_NODE,
> +					       __builtin_return_address(0));
> +
>  		if (WARN_ON(!stack))
>  			break;
>  		per_cpu(irq_stack_ptr, cpu) = &stack[THREAD_SIZE];
> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> index 284a80c0b6e1..039feb7cd590 100644
> --- a/arch/arm/kernel/setup.c
> +++ b/arch/arm/kernel/setup.c
> @@ -141,10 +141,10 @@ EXPORT_SYMBOL(outer_cache);
>  int __cpu_architecture __read_mostly = CPU_ARCH_UNKNOWN;
>
>  struct stack {
> -	u32 irq[3];
> -	u32 abt[3];
> -	u32 und[3];
> -	u32 fiq[3];
> +	u32 irq[4];
> +	u32 abt[4];
> +	u32 und[4];
> +	u32 fiq[4];
>  } ____cacheline_aligned;
>
>  #ifndef CONFIG_CPU_V7M
> diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S
> index 43077e11dafd..803b51e5cba0 100644
> --- a/arch/arm/kernel/sleep.S
> +++ b/arch/arm/kernel/sleep.S
> @@ -67,6 +67,14 @@ ENTRY(__cpu_suspend)
>  	ldr	r4, =cpu_suspend_size
>  #endif
>  	mov	r5, sp			@ current virtual SP
> +#ifdef CONFIG_VMAP_STACK
> +	@ Run the suspend code from the overflow stack so we don't have to rely
> +	@ on vmalloc-to-phys conversions anywhere in the arch suspend code.
> +	@ The original SP value captured in R5 will be restored on the way out.
> +	mov_l	r6, overflow_stack_ptr	@ Base pointer
> +	mrc	p15, 0, r7, c13, c0, 4	@ Get per-CPU offset
> +	ldr	sp, [r6, r7]		@ Address of this CPU's overflow stack
> +#endif
>  	add	r4, r4, #12		@ Space for pgd, virt sp, phys resume fn
>  	sub	sp, sp, r4		@ allocate CPU state on stack
>  	ldr	r3, =sleep_save_sp
> diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
> index b42c446cec9a..b28a705c49cb 100644
> --- a/arch/arm/kernel/traps.c
> +++ b/arch/arm/kernel/traps.c
> @@ -121,7 +121,8 @@ void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl)
>  static int verify_stack(unsigned long sp)
>  {
>  	if (sp < PAGE_OFFSET ||
> -	    (sp > (unsigned long)high_memory && high_memory != NULL))
> +	    (!IS_ENABLED(CONFIG_VMAP_STACK) &&
> +	     sp > (unsigned long)high_memory && high_memory != NULL))
>  		return -EFAULT;
>
>  	return 0;
> @@ -291,7 +292,8 @@ static int __die(const char *str, int err, struct pt_regs *regs)
>
>  	if (!user_mode(regs) || in_interrupt()) {
>  		dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp,
> -			 ALIGN(regs->ARM_sp, THREAD_SIZE));
> +			 ALIGN(regs->ARM_sp - THREAD_SIZE, THREAD_ALIGN)
> +			 + THREAD_SIZE);
>  		dump_backtrace(regs, tsk, KERN_EMERG);
>  		dump_instr(KERN_EMERG, regs);
>  	}
> @@ -838,3 +840,77 @@ void __init early_trap_init(void *vectors_base)
>  	 */
>  #endif
>  }
> +
> +#ifdef CONFIG_VMAP_STACK
> +
> +DECLARE_PER_CPU(u8 *, irq_stack_ptr);
> +
> +asmlinkage DEFINE_PER_CPU(u8 *, overflow_stack_ptr);
> +
> +static int __init allocate_overflow_stacks(void)
> +{
> +	u8 *stack;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		stack = (u8 *)__get_free_page(GFP_KERNEL);
> +		if (WARN_ON(!stack))
> +			return -ENOMEM;
> +		per_cpu(overflow_stack_ptr, cpu) = &stack[OVERFLOW_STACK_SIZE];
> +	}
> +	return 0;
> +}
> +early_initcall(allocate_overflow_stacks);
> +
> +asmlinkage void handle_bad_stack(struct pt_regs *regs)
> +{
> +	unsigned long tsk_stk = (unsigned long)current->stack;
> +	unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr);
> +	unsigned long ovf_stk = (unsigned long)this_cpu_read(overflow_stack_ptr);
> +
> +	console_verbose();
> +	pr_emerg("Insufficient stack space to handle exception!");
> +
> +	pr_emerg("Task stack:     [0x%08lx..0x%08lx]\n",
> +		 tsk_stk, tsk_stk + THREAD_SIZE);
> +	pr_emerg("IRQ stack:      [0x%08lx..0x%08lx]\n",
> +		 irq_stk - THREAD_SIZE, irq_stk);
> +	pr_emerg("Overflow stack: [0x%08lx..0x%08lx]\n",
> +		 ovf_stk - OVERFLOW_STACK_SIZE, ovf_stk);
> +
> +	die("kernel stack overflow", regs, 0);
> +}
> +
> +/*
> + * Normally, we rely on the logic in do_translation_fault() to update stale PMD
> + * entries covering the vmalloc space in a task's page tables when it first
> + * accesses the region in question. Unfortunately, this is not sufficient when
> + * the task stack resides in the vmalloc region, as do_translation_fault() is a
> + * C function that needs a stack to run.
> + *
> + * So we need to ensure that these PMD entries are up to date *before* the MM
> + * switch. As we already have some logic in the MM switch path that takes care
> + * of this, let's trigger it by bumping the counter every time the core vmalloc
> + * code modifies a PMD entry in the vmalloc region.
> + */
> +void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
> +{
> +	if (start > VMALLOC_END || end < VMALLOC_START)
> +		return;
> +
> +	/*
> +	 * This hooks into the core vmalloc code to receive notifications of
> +	 * any PMD level changes that have been made to the kernel page tables.
> +	 * This means it should only be triggered once for every MiB worth of
> +	 * vmalloc space, given that we don't support huge vmalloc/vmap on ARM,
> +	 * and that kernel PMD level table entries are rarely (if ever)
> +	 * updated.
> +	 *
> +	 * This means that the counter is going to max out at ~250 for the
> +	 * typical case. If it overflows, something entirely unexpected has
> +	 * occurred so let's throw a warning if that happens.
> +	 */
> +	WARN_ON(++init_mm.context.vmalloc_seq == UINT_MAX);
> +}
> +
> +#endif
> diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
> index e8d729975f12..c5ea328c428d 100644
> --- a/arch/arm/kernel/unwind.c
> +++ b/arch/arm/kernel/unwind.c
> @@ -389,7 +389,8 @@ int unwind_frame(struct stackframe *frame)
>
>  	/* store the highest address on the stack to avoid crossing it*/
>  	ctrl.sp_low = frame->sp;
> -	ctrl.sp_high = ALIGN(ctrl.sp_low, THREAD_SIZE);
> +	ctrl.sp_high = ALIGN(ctrl.sp_low - THREAD_SIZE, THREAD_ALIGN)
> +		       + THREAD_SIZE;
>
>  	pr_debug("%s(pc = %08lx lr = %08lx sp = %08lx)\n", __func__,
>  		 frame->pc, frame->lr, frame->sp);
> diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
> index f02d617e3359..aa12b65a7fd6 100644
> --- a/arch/arm/kernel/vmlinux.lds.S
> +++ b/arch/arm/kernel/vmlinux.lds.S
> @@ -138,12 +138,12 @@ SECTIONS
>  #ifdef CONFIG_STRICT_KERNEL_RWX
>  	. = ALIGN(1<<SECTION_SHIFT);
>  #else
> -	. = ALIGN(THREAD_SIZE);
> +	. = ALIGN(THREAD_ALIGN);
>  #endif
>  	__init_end = .;
>
>  	_sdata = .;
> -	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
> +	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
>  	_edata = .;
>
>  	BSS_SECTION(0, 0, 0)

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland