> On Oct 10, 2022, at 11:32 AM, Edgecombe, Rick P <rick.p.edgecombe@xxxxxxxxx> wrote: > > On Fri, 2022-10-07 at 16:43 -0700, Song Liu wrote: >> On x86 kernel, we allocate 2MB pages for kernel text up to >> round_down(_etext, 2MB). Therefore, some of the kernel text is still >> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to >> round_up(_etext, 2MB), and use the rest of the page for modules and >> BPF programs. >> >> Here is an example: >> >> [root@eth50-1 ~]# grep _etext /proc/kallsyms >> ffffffff82202a08 T _etext >> >> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms | tail -n 3 >> ffffffff8220f920 t >> bpf_prog_cc61a5364ac11d93_handle__sched_wakeup [bpf] >> ffffffff8220fa28 t >> bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new [bpf] >> ffffffff8220fad4 t >> bpf_prog_3bf73fa16f5e3d92_handle__sched_switch [bpf] >> >> [root@eth50-1 ~]# grep 0xffffffff82200000 >> /sys/kernel/debug/page_tables/kernel >> 0xffffffff82200000- >> 0xffffffff82400000 2M ro PSE x pmd >> >> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms >> ffffffff822ba910 t xfs_flush_inodes_worker [xfs] >> ffffffff822bc580 t xfs_flush_inodes [xfs] >> >> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, >> xfs >> module, and bpf programs. > > Can this memory range be freed as part of a vfree_exec() call then? > Does vmalloc actually try to unmap it? If so, it could get complicated > with PTI. > > It probably should be a special case that never gets fully freed. Right, this is never freed. 
> >> >> Signed-off-by: Song Liu <song@xxxxxxxxxx> >> --- >> arch/x86/mm/init_64.c | 3 ++- >> mm/vmalloc.c | 24 ++++++++++++++++++++++++ >> 2 files changed, 26 insertions(+), 1 deletion(-) >> >> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c >> index 0fe690ebc269..d94f196c541a 100644 >> --- a/arch/x86/mm/init_64.c >> +++ b/arch/x86/mm/init_64.c >> @@ -1367,12 +1367,13 @@ int __init >> deferred_page_init_max_threads(const struct cpumask *node_cpumask) >> >> int kernel_set_to_readonly; >> >> +#define PMD_ALIGN(x) (((unsigned long)(x) + (PMD_SIZE - 1)) & >> PMD_MASK) >> void mark_rodata_ro(void) >> { >> unsigned long start = PFN_ALIGN(_text); >> unsigned long rodata_start = PFN_ALIGN(__start_rodata); >> unsigned long end = (unsigned long)__end_rodata_hpage_align; >> - unsigned long text_end = PFN_ALIGN(_etext); >> + unsigned long text_end = PMD_ALIGN(_etext); > > This should probably have more logic and adjustments. If etext is PMD > aligned, some of the stuff outside the diff won't do anything. Hmm... I don't quite follow this comment. If _etext is PMD aligned, we can still use vmalloc_exec to allocate memory, so it shouldn't matter, no? > > Also, if a kernel doesn't have modules or BPF JIT it would be a waste > of memory. I guess we can add a command-line argument for these corner cases? 
Thanks, Song > >> unsigned long rodata_end = PFN_ALIGN(__end_rodata); >> unsigned long all_end; >> >> diff --git a/mm/vmalloc.c b/mm/vmalloc.c >> index 9212ff96b871..41509bbec583 100644 >> --- a/mm/vmalloc.c >> +++ b/mm/vmalloc.c >> @@ -75,6 +75,9 @@ static const bool vmap_allow_huge = false; >> #define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE) >> #define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE) >> >> +static struct vm_struct text_tail_vm; >> +static struct vmap_area text_tail_va; >> + >> bool is_vmalloc_addr(const void *x) >> { >> unsigned long addr = (unsigned long)kasan_reset_tag(x); >> @@ -637,6 +640,8 @@ int is_vmalloc_or_module_addr(const void *x) >> unsigned long addr = (unsigned long)kasan_reset_tag(x); >> if (addr >= MODULES_VADDR && addr < MODULES_END) >> return 1; >> + if (addr >= text_tail_va.va_start && addr < >> text_tail_va.va_end) >> + return 1; >> #endif >> return is_vmalloc_addr(x); >> } >> @@ -2422,6 +2427,24 @@ static void vmap_init_free_space(void) >> } >> } >> >> +static void register_text_tail_vm(void) >> +{ >> + unsigned long start = PFN_ALIGN((unsigned long)_etext); >> + unsigned long end = PMD_ALIGN((unsigned long)_etext); >> + struct vmap_area *va; >> + >> + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); >> + if (WARN_ON_ONCE(!va)) >> + return; >> + text_tail_vm.addr = (void *)start; >> + text_tail_vm.size = end - start; >> + text_tail_va.va_start = start; >> + text_tail_va.va_end = end; >> + text_tail_va.vm = &text_tail_vm; >> + memcpy(va, &text_tail_va, sizeof(*va)); >> + insert_vmap_area_augment(va, NULL, &free_text_area_root, >> &free_text_area_list); >> +} >> + >> void __init vmalloc_init(void) >> { >> struct vmap_area *va; >> @@ -2432,6 +2455,7 @@ void __init vmalloc_init(void) >> * Create the cache for vmap_area objects. >> */ >> vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); >> + register_text_tail_vm(); >> >> for_each_possible_cpu(i) { >> struct vmap_block_queue *vbq;