Rather than providing our own callbacks for walking the page tables, switch to using the generic version. Signed-off-by: Steven Price <steven.price@xxxxxxx> --- arch/x86/Kconfig | 1 + arch/x86/Kconfig.debug | 20 +-- arch/x86/mm/Makefile | 4 +- arch/x86/mm/dump_pagetables.c | 297 +++++++--------------------------- 4 files changed, 62 insertions(+), 260 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c1f9b3cf437c..122c24055f02 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -106,6 +106,7 @@ config X86 select GENERIC_IRQ_RESERVATION_MODE select GENERIC_IRQ_SHOW select GENERIC_PENDING_IRQ if SMP + select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 15d0fbe27872..dc1dfe213657 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -62,26 +62,10 @@ config EARLY_PRINTK_USB_XDBC config MCSAFE_TEST def_bool n -config X86_PTDUMP_CORE - def_bool n - -config X86_PTDUMP - tristate "Export kernel pagetable layout to userspace via debugfs" - depends on DEBUG_KERNEL - select DEBUG_FS - select X86_PTDUMP_CORE - ---help--- - Say Y here if you want to show the kernel pagetable layout in a - debugfs file. This information is only useful for kernel developers - who are working in architecture specific areas of the kernel. - It is probably not a good idea to enable this feature in a production - kernel. - If in doubt, say "N" - config EFI_PGT_DUMP bool "Dump the EFI pagetable" depends on EFI - select X86_PTDUMP_CORE + select PTDUMP_CORE ---help--- Enable this if you want to dump the EFI page table before enabling virtual mode. This can be used to debug miscellaneous @@ -90,7 +74,7 @@ config EFI_PGT_DUMP config DEBUG_WX bool "Warn on W+X mappings at boot" - select X86_PTDUMP_CORE + select PTDUMP_CORE ---help--- Generate a warning if any W+X mappings are found at boot. 
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 4b101dd6e52f..5233190fc6bf 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -28,8 +28,8 @@ obj-$(CONFIG_X86_PAT) += pat_rbtree.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o -obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o +obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_PTDUMP_DEBUGFS) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index f6b814aaddf7..955824c7cddb 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -20,6 +20,7 @@ #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/pci.h> +#include <linux/ptdump.h> #include <asm/e820/types.h> #include <asm/pgtable.h> @@ -30,15 +31,12 @@ * when a "break" in the continuity is found. */ struct pg_state { + struct ptdump_state ptdump; int level; - pgprot_t current_prot; + pgprotval_t current_prot; pgprotval_t effective_prot; - pgprotval_t effective_prot_pgd; - pgprotval_t effective_prot_p4d; - pgprotval_t effective_prot_pud; - pgprotval_t effective_prot_pmd; + pgprotval_t prot_levels[5]; unsigned long start_address; - unsigned long current_address; const struct addr_marker *marker; unsigned long lines; bool to_dmesg; @@ -179,9 +177,8 @@ static struct addr_marker address_markers[] = { /* * Print a readable form of a pgprot_t to the seq_file */ -static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) +static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg) { - pgprotval_t pr = pgprot_val(prot); static const char * const level_name[] = { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; @@ -228,24 +225,11 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]); } -/* - * On 
64 bits, sign-extend the 48 bit address to 64 bit - */ -static unsigned long normalize_addr(unsigned long u) -{ - int shift; - if (!IS_ENABLED(CONFIG_X86_64)) - return u; - - shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); - return (signed long)(u << shift) >> shift; -} - -static void note_wx(struct pg_state *st) +static void note_wx(struct pg_state *st, unsigned long addr) { unsigned long npages; - npages = (st->current_address - st->start_address) / PAGE_SIZE; + npages = (addr - st->start_address) / PAGE_SIZE; #ifdef CONFIG_PCI_BIOS /* @@ -253,7 +237,7 @@ static void note_wx(struct pg_state *st) * Inform about it, but avoid the warning. */ if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && - st->current_address <= PAGE_OFFSET + BIOS_END) { + addr <= PAGE_OFFSET + BIOS_END) { pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); return; } @@ -264,25 +248,44 @@ static void note_wx(struct pg_state *st) (void *)st->start_address); } +static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +{ + return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | + ((prot1 | prot2) & _PAGE_NX); +} + /* * This function gets called on a break in a continuous series * of PTE entries; the next one is different so we need to * print what we collected so far. 
*/ -static void note_page(struct pg_state *st, pgprot_t new_prot, - pgprotval_t new_eff, int level) +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, + unsigned long val) { - pgprotval_t prot, cur, eff; + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + pgprotval_t new_prot, new_eff; + pgprotval_t cur, eff; static const char units[] = "BKMGTPE"; struct seq_file *m = st->seq; + new_prot = val & PTE_FLAGS_MASK; + + if (level > 1) { + new_eff = effective_prot(st->prot_levels[level - 2], + new_prot); + } else { + new_eff = new_prot; + } + + if (level > 0) + st->prot_levels[level-1] = new_eff; + /* * If we have a "break" in the series, we need to flush the state that * we have now. "break" is either changing perms, levels or * address space marker. */ - prot = pgprot_val(new_prot); - cur = pgprot_val(st->current_prot); + cur = st->current_prot; eff = st->effective_prot; if (!st->level) { @@ -294,14 +297,14 @@ static void note_page(struct pg_state *st, pgprot_t new_prot, st->lines = 0; pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || new_eff != eff || level != st->level || - st->current_address >= st->marker[1].start_address) { + } else if (new_prot != cur || new_eff != eff || level != st->level || + addr >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; int width = sizeof(unsigned long) * 2; if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) - note_wx(st); + note_wx(st, addr); /* * Now print the actual finished series @@ -311,9 +314,9 @@ static void note_page(struct pg_state *st, pgprot_t new_prot, pt_dump_seq_printf(m, st->to_dmesg, "0x%0*lx-0x%0*lx ", width, st->start_address, - width, st->current_address); + width, addr); - delta = st->current_address - st->start_address; + delta = addr - st->start_address; while (!(delta & 1023) && unit[1]) { delta >>= 10; unit++; @@ -331,7 +334,7 @@ static void note_page(struct pg_state 
*st, pgprot_t new_prot, * such as the start of vmalloc space etc. * This helps in the interpretation. */ - if (st->current_address >= st->marker[1].start_address) { + if (addr >= st->marker[1].start_address) { if (st->marker->max_lines && st->lines > st->marker->max_lines) { unsigned long nskip = @@ -347,228 +350,42 @@ static void note_page(struct pg_state *st, pgprot_t new_prot, st->marker->name); } - st->start_address = st->current_address; + st->start_address = addr; st->current_prot = new_prot; st->effective_prot = new_eff; st->level = level; } } -static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) -{ - return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | - ((prot1 | prot2) & _PAGE_NX); -} - -static int ptdump_pte_entry(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - struct pg_state *st = walk->private; - pgprotval_t eff, prot; - - st->current_address = normalize_addr(addr); - - prot = pte_flags(*pte); - eff = effective_prot(st->effective_prot_pmd, prot); - note_page(st, __pgprot(prot), eff, 5); - - return 0; -} - -#ifdef CONFIG_KASAN - -/* - * This is an optimization for KASAN=y case. Since all kasan page tables - * eventually point to the kasan_early_shadow_page we could call note_page() - * right away without walking through lower level page tables. This saves - * us dozens of seconds (minutes for 5-level config) while checking for - * W+X mapping or reading kernel_page_tables debugfs file. 
- */ -static inline bool kasan_page_table(struct pg_state *st, void *pt) -{ - if (__pa(pt) == __pa(kasan_early_shadow_pmd) || - (pgtable_l5_enabled() && - __pa(pt) == __pa(kasan_early_shadow_p4d)) || - __pa(pt) == __pa(kasan_early_shadow_pud)) { - pgprotval_t prot = pte_flags(kasan_early_shadow_pte[0]); - note_page(st, __pgprot(prot), 0, 5); - return true; - } - return false; -} -#else -static inline bool kasan_page_table(struct pg_state *st, void *pt) -{ - return false; -} -#endif - -static int ptdump_test_pmd(unsigned long addr, unsigned long next, - pmd_t *pmd, struct mm_walk *walk) -{ - struct pg_state *st = walk->private; - - st->current_address = normalize_addr(addr); - - if (kasan_page_table(st, pmd)) - return 1; - return 0; -} - -static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - struct pg_state *st = walk->private; - pgprotval_t eff, prot; - - prot = pmd_flags(*pmd); - eff = effective_prot(st->effective_prot_pud, prot); - - st->current_address = normalize_addr(addr); - - if (pmd_large(*pmd)) - note_page(st, __pgprot(prot), eff, 4); - - st->effective_prot_pmd = eff; - - return 0; -} - -static int ptdump_test_pud(unsigned long addr, unsigned long next, - pud_t *pud, struct mm_walk *walk) -{ - struct pg_state *st = walk->private; - - st->current_address = normalize_addr(addr); - - if (kasan_page_table(st, pud)) - return 1; - return 0; -} - -static int ptdump_pud_entry(pud_t *pud, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - struct pg_state *st = walk->private; - pgprotval_t eff, prot; - - prot = pud_flags(*pud); - eff = effective_prot(st->effective_prot_p4d, prot); - - st->current_address = normalize_addr(addr); - - if (pud_large(*pud)) - note_page(st, __pgprot(prot), eff, 3); - - st->effective_prot_pud = eff; - - return 0; -} - -static int ptdump_test_p4d(unsigned long addr, unsigned long next, - p4d_t *p4d, struct mm_walk *walk) -{ - struct pg_state *st = walk->private; - - 
st->current_address = normalize_addr(addr); - - if (kasan_page_table(st, p4d)) - return 1; - return 0; -} - -static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - struct pg_state *st = walk->private; - pgprotval_t eff, prot; - - prot = p4d_flags(*p4d); - eff = effective_prot(st->effective_prot_pgd, prot); - - st->current_address = normalize_addr(addr); - - if (p4d_large(*p4d)) - note_page(st, __pgprot(prot), eff, 2); - - st->effective_prot_p4d = eff; - - return 0; -} - -static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - struct pg_state *st = walk->private; - pgprotval_t eff, prot; +static const struct ptdump_range ptdump_ranges[] = { +#ifdef CONFIG_X86_64 - prot = pgd_flags(*pgd); +#define normalize_addr_shift (64 - (__VIRTUAL_MASK_SHIFT + 1)) +#define normalize_addr(u) ((signed long)(u << normalize_addr_shift) >> normalize_addr_shift) -#ifdef CONFIG_X86_PAE - eff = _PAGE_USER | _PAGE_RW; + {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2}, + {normalize_addr(PTRS_PER_PGD * PGD_LEVEL_MULT / 2), ~0UL}, #else - eff = prot; + {0, ~0UL}, #endif - - st->current_address = normalize_addr(addr); - - if (pgd_large(*pgd)) - note_page(st, __pgprot(prot), eff, 1); - - st->effective_prot_pgd = eff; - - return 0; -} - -static int ptdump_hole(unsigned long addr, unsigned long next, - struct mm_walk *walk) -{ - struct pg_state *st = walk->private; - - st->current_address = normalize_addr(addr); - - note_page(st, __pgprot(0), 0, -1); - - return 0; -} + {0, 0} +}; static void ptdump_walk_pgd_level_core(struct seq_file *m, struct mm_struct *mm, bool checkwx, bool dmesg) { - struct pg_state st = {}; - struct mm_walk walk = { - .mm = mm, - .pgd_entry = ptdump_pgd_entry, - .p4d_entry = ptdump_p4d_entry, - .pud_entry = ptdump_pud_entry, - .pmd_entry = ptdump_pmd_entry, - .pte_entry = ptdump_pte_entry, - .test_p4d = ptdump_test_p4d, - .test_pud = ptdump_test_pud, - .test_pmd = 
ptdump_test_pmd, - .pte_hole = ptdump_hole, - .private = &st + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = ptdump_ranges + }, + .to_dmesg = dmesg, + .check_wx = checkwx, + .seq = m }; - st.to_dmesg = dmesg; - st.check_wx = checkwx; - st.seq = m; - if (checkwx) - st.wx_pages = 0; - - down_read(&mm->mmap_sem); -#ifdef CONFIG_X86_64 - walk_page_range(0, PTRS_PER_PGD*PGD_LEVEL_MULT/2, &walk); - walk_page_range(normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT/2), ~0, - &walk); -#else - walk_page_range(0, ~0, &walk); -#endif - up_read(&mm->mmap_sem); + ptdump_walk_pgd(&st.ptdump, mm); - /* Flush out the last page */ - st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); - note_page(&st, __pgprot(0), 0, 0); if (!checkwx) return; if (st.wx_pages) -- 2.20.1