On Mon, 2024-08-19 at 20:24 +0100, Mark Brown wrote:

[snip]

> 
> diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
> index 059685612362..42b2b18de20d 100644
> --- a/arch/x86/kernel/shstk.c
> +++ b/arch/x86/kernel/shstk.c
> @@ -191,44 +191,103 @@ void reset_thread_features(void)
>  	current->thread.features_locked = 0;
>  }
>  
> -unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
> -				       unsigned long stack_size)
> +int arch_shstk_validate_clone(struct task_struct *t,
> +			      struct vm_area_struct *vma,
> +			      struct page *page,
> +			      struct kernel_clone_args *args)
> +{
> +	/*
> +	 * SSP is aligned, so reserved bits and mode bit are a zero, just mark
> +	 * the token 64-bit.
> +	 */
> +	void *maddr = kmap_local_page(page);
> +	int offset;
> +	unsigned long addr, ssp;
> +	u64 expected;
> +	u64 val;
> +
> +	if (!features_enabled(ARCH_SHSTK_SHSTK))
> +		return 0;
> +
> +	ssp = args->shadow_stack + args->shadow_stack_size;
> +	addr = ssp - SS_FRAME_SIZE;
> +	expected = ssp | BIT(0);
> +	offset = offset_in_page(ssp);
> +
> +	/* This should really be an atomic cmpxchg. It is not. */
> +	copy_from_user_page(vma, page, addr, &val, maddr + offset,
> +			    sizeof(val));

We're so close to the real cmpxchg at this point. I took a shot at it
with the diff at the end. I'm not sure if it might need some of the
instrumentation calls.

> +
> +	if (val != expected)
> +		return false;

Returning false from an int function will be 0 (i.e. success). I think
it might be covering up a bug: the gup happens at
args->shadow_stack + args->shadow_stack_size - 1 (the size inclusive),
but the copy happens at the size exclusive. So shadow_stack_size =
PAGE_SIZE will try to read the token at the start of the shadow stack,
and the failure will be reported as success. I think...

On another note, I think we need to verify that ssp is 8 byte aligned,
or it could be made to overflow into the adjacent direct map page by a
few bytes. At least I didn't see how that is prevented.

> +	val = 0;
> +
> +	copy_to_user_page(vma, page, addr, maddr + offset, &val, sizeof(val));
> +	set_page_dirty_lock(page);
> +
> +	return 0;
> +}
> +

[snip]

>  
> +static int shstk_validate_clone(struct task_struct *p,
> +				struct kernel_clone_args *args)
> +{
> +	struct mm_struct *mm;
> +	struct vm_area_struct *vma;
> +	struct page *page;
> +	unsigned long addr;
> +	int ret;
> +
> +	if (!IS_ENABLED(CONFIG_ARCH_HAS_USER_SHADOW_STACK))
> +		return 0;
> +
> +	if (!args->shadow_stack)
> +		return 0;
> +
> +	mm = get_task_mm(p);
> +	if (!mm)
> +		return -EFAULT;
> +
> +	mmap_read_lock(mm);
> +
> +	/*
> +	 * All current shadow stack architectures have tokens at the
> +	 * top of a downward growing shadow stack.
> +	 */
> +	addr = args->shadow_stack + args->shadow_stack_size - 1;
> +	addr = untagged_addr_remote(mm, addr);
> +
> +	page = get_user_page_vma_remote(mm, addr, FOLL_FORCE | FOLL_WRITE,
> +					&vma);
> +	if (IS_ERR(page)) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (!(vma->vm_flags & VM_SHADOW_STACK)) {

Can we check VM_WRITE here too? At least on x86, shadow stacks can be
mprotect()ed as read-only. The reason for that, I think, fell out of
the implementation details, but all the same it would be nice to be
consistent. Then it should behave identically to a real shadow stack
access.
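
Something like this is what I'm picturing (just an untested sketch of
the extra check, not something I've run):

	/*
	 * Untested sketch: also reject shadow stacks that have been
	 * mprotect()ed read-only, so the token write behaves like a
	 * real shadow stack access.
	 */
	if (!(vma->vm_flags & VM_SHADOW_STACK) || !(vma->vm_flags & VM_WRITE)) {
		ret = -EFAULT;
		goto out_page;
	}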
> +		ret = -EFAULT;
> +		goto out_page;
> +	}
> +
> +	ret = arch_shstk_validate_clone(p, vma, page, args);
> +
> +out_page:
> +	put_page(page);
> +out:
> +	mmap_read_unlock(mm);
> +	mmput(mm);
> +	return ret;
> +}
> +

[snip]

>  
> +/**
> + * clone3_shadow_stack_valid - check and prepare shadow stack
> + * @kargs: kernel clone args
> + *
> + * Verify that shadow stacks are only enabled if supported.
> + */
> +static inline bool clone3_shadow_stack_valid(struct kernel_clone_args *kargs)
> +{
> +	if (kargs->shadow_stack) {
> +		if (!kargs->shadow_stack_size)
> +			return false;
> +
> +		if (kargs->shadow_stack_size < SHADOW_STACK_SIZE_MIN)
> +			return false;
> +
> +		if (kargs->shadow_stack_size > rlimit(RLIMIT_STACK))
> +			return false;

At the risk of asking a stupid question, or one that I should have
asked a long time ago... why do we need both shadow_stack and
shadow_stack_size? We are basically asking the kernel to consume a
token at a pointer and have userspace manage the shadow stack itself,
so why does the kernel care what size it is? Couldn't we just have
'shadow_stack' mean "consume a token here"?

> +
> +		/*
> +		 * The architecture must check support on the specific
> +		 * machine.
> +		 */
> +		return IS_ENABLED(CONFIG_ARCH_HAS_USER_SHADOW_STACK);
> +	} else {
> +		return !kargs->shadow_stack_size;
> +	}
> +}
> +

Fixing some of the mentioned bugs, this on top passed the selftests for
me. It doesn't have the 8 byte alignment check I mentioned because I'm
less sure I'm not just missing it somewhere.

diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 42b2b18de20d..2685180b8c5c 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -204,7 +204,6 @@ int arch_shstk_validate_clone(struct task_struct *t,
 	int offset;
 	unsigned long addr, ssp;
 	u64 expected;
-	u64 val;
 
 	if (!features_enabled(ARCH_SHSTK_SHSTK))
 		return 0;
@@ -212,17 +211,12 @@ int arch_shstk_validate_clone(struct task_struct *t,
 	ssp = args->shadow_stack + args->shadow_stack_size;
 	addr = ssp - SS_FRAME_SIZE;
 	expected = ssp | BIT(0);
-	offset = offset_in_page(ssp);
+	offset = offset_in_page(addr);
 
-	/* This should really be an atomic cmpxchg. It is not. */
-	copy_from_user_page(vma, page, addr, &val, maddr + offset,
-			    sizeof(val));
+	if (!cmpxchg_to_user_page(vma, page, addr, (unsigned long *)(maddr + offset),
+				  expected, 0))
+		return -EINVAL;
 
-	if (val != expected)
-		return false;
-	val = 0;
-
-	copy_to_user_page(vma, page, addr, maddr + offset, &val, sizeof(val));
 	set_page_dirty_lock(page);
 
 	return 0;
diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h
index 7ee8a179d103..1500d49bc3f7 100644
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
@@ -124,4 +124,15 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
 	} while (0)
 #endif
 
+#ifndef cmpxchg_to_user_page
+#define cmpxchg_to_user_page(vma, page, vaddr, ptr, old, new)		\
+({									\
+	bool ret;							\
+									\
+	ret = try_cmpxchg(ptr, &old, new);				\
+	flush_icache_user_page(vma, page, vaddr, sizeof(*ptr));	\
+	ret;								\
+})
+#endif
+
 #endif /* _ASM_GENERIC_CACHEFLUSH_H */
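
For reference, the alignment check I was thinking of would just be
something along these lines in arch_shstk_validate_clone(), assuming it
isn't already enforced somewhere I missed (untested):

	/*
	 * Untested sketch: ssp comes straight from userspace as
	 * shadow_stack + shadow_stack_size, so reject a misaligned
	 * value before the 8 byte token access can run past the end
	 * of the page that was gup'd.
	 */
	if (!IS_ALIGNED(ssp, sizeof(u64)))
		return -EINVAL;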