This is a note to let you know that I've just added the patch titled

    kaiser: do not set _PAGE_NX on pgd_none

to the 4.9-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
    kaiser-do-not-set-_page_nx-on-pgd_none.patch
and it can be found in the queue-4.9 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.


From foo@baz Wed Jan 3 20:37:21 CET 2018
From: Hugh Dickins <hughd@xxxxxxxxxx>
Date: Tue, 5 Sep 2017 12:05:01 -0700
Subject: kaiser: do not set _PAGE_NX on pgd_none

From: Hugh Dickins <hughd@xxxxxxxxxx>

native_pgd_clear() uses native_set_pgd(), so native_set_pgd() must
avoid setting the _PAGE_NX bit on an otherwise pgd_none() entry:
usually that just generated a warning on exit, but sometimes
more mysterious and damaging failures (our production machines
could not complete booting).

The original fix to this just avoided adding _PAGE_NX to
an empty entry; but eventually more problems surfaced with kexec,
and EFI mapping expected to be a problem too. So now instead
change native_set_pgd() to update shadow only if _PAGE_USER:

A few places (kernel/machine_kexec_64.c, platform/efi/efi_64.c for sure)
use set_pgd() to set up a temporary internal virtual address space, with
physical pages remapped at what Kaiser regards as userspace addresses:
Kaiser then assumes a shadow pgd follows, which it will try to corrupt.

This appears to be responsible for the recent kexec and kdump failures;
though it's unclear how those did not manifest as a problem before.
Ah, the shadow pgd will only be assumed to "follow" if the requested
pgd is on an even-numbered page: so I suppose it was going wrong 50%
of the time all along.

What we need is a flag to set_pgd(), to tell it we're dealing with
userspace. Er, isn't that what the pgd's _PAGE_USER bit is saying?
Add a test for that.
But we cannot do the same for pgd_clear() (which may be called to clear corrupted entries - set aside the question of "corrupt in which pgd?" until later), so there just rely on pgd_clear() not being called in the problematic cases - with a WARN_ON_ONCE() which should fire half the time if it is. But this is getting too big for an inline function: move it into arch/x86/mm/kaiser.c (which then demands a boot/compressed mod); and de-void and de-space native_get_shadow/normal_pgd() while here. Also make an unnecessary change to KASLR's init_trampoline(): it was using set_pgd() to assign a pgd-value to a global variable (not in a pg directory page), which was rather scary given Kaiser's previous set_pgd() implementation: not a problem now, but too scary to leave as was, it could easily blow up if we have to change set_pgd() again. Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- arch/x86/boot/compressed/misc.h | 1 arch/x86/include/asm/pgtable_64.h | 51 +++++++++----------------------------- arch/x86/mm/kaiser.c | 42 +++++++++++++++++++++++++++++++ arch/x86/mm/kaslr.c | 4 +- 4 files changed, 58 insertions(+), 40 deletions(-) --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -9,6 +9,7 @@ */ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_KAISER #undef CONFIG_KASAN #include <linux/linkage.h> --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -107,61 +107,36 @@ static inline void native_pud_clear(pud_ } #ifdef CONFIG_KAISER -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); + +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { - return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +static inline pgd_t 
*native_get_normal_pgd(pgd_t *pgdp) { - return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); + return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); } #else -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { BUILD_BUG_ON(1); return NULL; } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) { return pgdp; } #endif /* CONFIG_KAISER */ -/* - * Page table pages are page-aligned. The lower half of the top - * level is used for userspace and the top half for the kernel. - * This returns true for user pages that need to get copied into - * both the user and kernel copies of the page tables, and false - * for kernel pages that should only be in the kernel copy. - */ -static inline bool is_userspace_pgd(void *__ptr) -{ - unsigned long ptr = (unsigned long)__ptr; - - return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); -} - static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { -#ifdef CONFIG_KAISER - pteval_t extra_kern_pgd_flags = 0; - /* Do we need to also populate the shadow pgd? */ - if (is_userspace_pgd(pgdp)) { - native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; - /* - * Even if the entry is *mapping* userspace, ensure - * that userspace can not use it. This way, if we - * get out to userspace running on the kernel CR3, - * userspace will crash instead of running. - */ - extra_kern_pgd_flags = _PAGE_NX; - } - pgdp->pgd = pgd.pgd; - pgdp->pgd |= extra_kern_pgd_flags; -#else /* CONFIG_KAISER */ - *pgdp = pgd; -#endif + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); } static inline void native_pgd_clear(pgd_t *pgd) --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -302,4 +302,46 @@ void kaiser_remove_mapping(unsigned long unmap_pud_range_nofree(pgd, addr, end); } } + +/* + * Page table pages are page-aligned. 
The lower half of the top + * level is used for userspace and the top half for the kernel. + * This returns true for user pages that need to get copied into + * both the user and kernel copies of the page tables, and false + * for kernel pages that should only be in the kernel copy. + */ +static inline bool is_userspace_pgd(pgd_t *pgdp) +{ + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); +} + +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Do we need to also populate the shadow pgd? Check _PAGE_USER to + * skip cases like kexec and EFI which make temporary low mappings. + */ + if (pgd.pgd & _PAGE_USER) { + if (is_userspace_pgd(pgdp)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + /* + * Even if the entry is *mapping* userspace, ensure + * that userspace can not use it. This way, if we + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. + */ + pgd.pgd |= _PAGE_NX; + } + } else if (!pgd.pgd) { + /* + * pgd_clear() cannot check _PAGE_USER, and is even used to + * clear corrupted pgd entries: so just rely on cases like + * kexec and EFI never to be using pgd_clear(). 
+ */ + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && + is_userspace_pgd(pgdp)) + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + return pgd; +} #endif /* CONFIG_KAISER */ --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -189,6 +189,6 @@ void __meminit init_trampoline(void) *pud_tramp = *pud; } - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + /* Avoid set_pgd(), in case it's complicated by CONFIG_KAISER */ + trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); } Patches currently in stable-queue which might be from hughd@xxxxxxxxxx are queue-4.9/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch queue-4.9/kaiser-add-nokaiser-boot-option-using-alternative.patch queue-4.9/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch queue-4.9/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch queue-4.9/x86-paravirt-dont-patch-flush_tlb_single.patch queue-4.9/kaiser-merged-update.patch queue-4.9/kaiser-delete-kaiser_real_switch-option.patch queue-4.9/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch queue-4.9/kaiser-fix-perf-crashes.patch queue-4.9/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch queue-4.9/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch queue-4.9/kaiser-enhanced-by-kernel-and-user-pcids.patch queue-4.9/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch queue-4.9/kaiser-align-addition-to-x86-mm-makefile.patch queue-4.9/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch queue-4.9/kaiser-stack-map-page_size-at-thread_size-page_size.patch queue-4.9/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch queue-4.9/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch queue-4.9/kaiser-do-not-set-_page_nx-on-pgd_none.patch queue-4.9/kaiser-tidied-up-asm-kaiser.h-somewhat.patch queue-4.9/kaiser-cleanups-while-trying-for-gold-link.patch queue-4.9/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch 
queue-4.9/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch queue-4.9/kaiser-kernel-address-isolation.patch queue-4.9/kaiser-enomem-if-kaiser_pagetable_walk-null.patch queue-4.9/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch queue-4.9/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch queue-4.9/kaiser-kaiser-depends-on-smp.patch queue-4.9/kaiser-pcid-0-for-kernel-and-128-for-user.patch