On x86 kernel, we allocate 2MB pages for kernel text up to round_down(_etext, 2MB). Therefore, some of the kernel text is still on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to round_up(_etext, 2MB), and use the rest of the page for modules and BPF programs. Here is an example: [root@eth50-1 ~]# grep _etext /proc/kallsyms ffffffff82202a08 T _etext [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms | tail -n 3 ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup [bpf] ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new [bpf] ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch [bpf] [root@eth50-1 ~]# grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel 0xffffffff82200000-0xffffffff82400000 2M ro PSE x pmd [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms ffffffff822ba910 t xfs_flush_inodes_worker [xfs] ffffffff822bc580 t xfs_flush_inodes [xfs] ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, xfs module, and bpf programs. --- arch/x86/mm/init_64.c | 3 ++- mm/vmalloc.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 39c5246964a9..d27d0af5beb5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask) int kernel_set_to_readonly; +#define PMD_ALIGN(x) (((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK) void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); unsigned long rodata_start = PFN_ALIGN(__start_rodata); unsigned long end = (unsigned long)__end_rodata_hpage_align; - unsigned long text_end = PFN_ALIGN(_etext); + unsigned long text_end = PMD_ALIGN(_etext); unsigned long rodata_end = PFN_ALIGN(__end_rodata); unsigned long all_end; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 472287e71bf1..5f3b5df9313f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -72,6 +72,11 @@ early_param("nohugevmalloc", set_nohugevmalloc); static const bool vmap_allow_huge = false; #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +#define PMD_ALIGN(x) (((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK) + +static struct vm_struct text_tail_vm; +static struct vmap_area text_tail_va; + bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)kasan_reset_tag(x); @@ -634,6 +639,8 @@ int is_vmalloc_or_module_addr(const void *x) unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; + if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end) + return 1; #endif return is_vmalloc_addr(x); } @@ -2371,6 +2378,25 @@ static void vmap_init_free_space(void) } } +static void register_text_tail_vm(void) +{ + unsigned long start = PFN_ALIGN(_etext); + unsigned long end = PMD_ALIGN(_etext); + struct vmap_area *va; + + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (WARN_ON_ONCE(!va)) + return; + text_tail_vm.addr = (void *)start; + text_tail_vm.size = end - start; + text_tail_vm.flags = VM_KERNEL_EXEC; + text_tail_va.va_start = start; + text_tail_va.va_end = end; + text_tail_va.vm = &text_tail_vm; + memcpy(va, &text_tail_va, sizeof(*va)); + insert_vmap_area(va, &free_text_area_root, &free_text_area_list); +} + void __init vmalloc_init(void) { struct vmap_area *va; @@ -2381,6 +2407,7 @@ void __init vmalloc_init(void) * Create the cache for vmap_area objects. */ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); + register_text_tail_vm(); for_each_possible_cpu(i) { struct vmap_block_queue *vbq; -- 2.30.2