> On Oct 6, 2022, at 4:15 PM, Luis Chamberlain <mcgrof@xxxxxxxxxx> wrote:
>
> On Thu, Aug 18, 2022 at 03:42:14PM -0700, Song Liu wrote:
>> --- a/mm/nommu.c
>> +++ b/mm/nommu.c
>> @@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
>> }
>> EXPORT_SYMBOL(vm_map_pages_zero);
>>
>> +void *vmalloc_exec(unsigned long size, unsigned long align)
>> +{
>> +	return NULL;
>> +}
>
> Well that's not so nice for no-mmu systems. Shouldn't we have a
> fallback?

This is still an early version, so I am not quite sure whether we need
the fallback for no-mmu systems.

>
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index effd1ff6a4b4..472287e71bf1 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>> 	va->va_end = addr + size;
>> 	va->vm = NULL;
>>
>> -	spin_lock(&vmap_area_lock);
>> -	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> -	spin_unlock(&vmap_area_lock);
>> +	if (vm_flags & VM_KERNEL_EXEC) {
>> +		spin_lock(&free_text_area_lock);
>> +		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
>> +		/* update subtree_max_size now as we need this soon */
>> +		augment_tree_propagate_from(va);
>
> Sorry, it is not clear to me why it's needed only for exec, can you elaborate a
> bit more?

This version was wrong. We should use insert_vmap_area_augment() here
(rough sketch further down in this mail). Actually, I changed this in
the latest version.

>
>> +		spin_unlock(&free_text_area_lock);
>> +	} else {
>> +		spin_lock(&vmap_area_lock);
>> +		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> +		spin_unlock(&vmap_area_lock);
>> +	}
>>
>> 	BUG_ON(!IS_ALIGNED(va->va_start, align));
>> 	BUG_ON(va->va_start < vstart);
>
> <-- snip -->
>
>> @@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
>> }
>> EXPORT_SYMBOL(vmalloc);
>>
>> +void *vmalloc_exec(unsigned long size, unsigned long align)
>> +{
>> +	struct vmap_area *va, *tmp;
>> +	unsigned long addr;
>> +	enum fit_type type;
>> +	int ret;
>> +
>> +	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
>> +	if (unlikely(!va))
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +again:
>> +	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
>> +	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
>> +			size, align, 1, false);
>> +
>> +	if (!tmp) {
>> +		unsigned long alloc_size;
>> +		void *ptr;
>> +
>> +		spin_unlock(&free_text_area_lock);
>> +
>> +		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
>> +		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
>> +				MODULES_END, GFP_KERNEL, PAGE_KERNEL,
>> +				VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
>> +				NUMA_NO_NODE, __builtin_return_address(0));
>
> We can review the guard stuff on the other thread with Peter.
>
>> +		if (unlikely(!ptr)) {
>> +			ret = -ENOMEM;
>> +			goto err_out;
>> +		}
>> +		memset(ptr, 0, alloc_size);
>> +		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
>> +		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
>
> I *really* like that this is now not something users have to muck with thanks!

Well, this pushed some other complexity to the user side, for example,
all those hacks with text_poke in 3/5.
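To make that concrete: because the area is already mapped RO+X when the
caller gets it back, the caller side ends up doing something like the
sketch below. This is only an illustration of the pattern (x86, using
text_poke_copy(); fill_exec_area() is a made-up name), not the actual
code in 3/5:

/*
 * Illustration only, not the code from 3/5: with the region mapped
 * RO+X up front, the caller cannot memcpy() into it and has to go
 * through the text poking machinery to install its instructions.
 */
static void *fill_exec_area(const void *insns, size_t len)
{
	void *area = vmalloc_exec(len, PAGE_SIZE);

	if (IS_ERR_OR_NULL(area))
		return NULL;

	/* text_poke_copy() writes through a temporary RW alias (x86) */
	text_poke_copy(area, insns, len);
	return area;
}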
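And going back to the insert_vmap_area_augment() point above, the fix
I have in mind is roughly the hunk below (untested sketch, not the
exact diff from the latest version):

	if (vm_flags & VM_KERNEL_EXEC) {
		spin_lock(&free_text_area_lock);
		/*
		 * insert_vmap_area_augment() updates subtree_max_size
		 * as part of the insertion, so the separate
		 * augment_tree_propagate_from() call goes away.
		 */
		insert_vmap_area_augment(va, NULL, &free_text_area_root,
					 &free_text_area_list);
		spin_unlock(&free_text_area_lock);
	} else {
		spin_lock(&vmap_area_lock);
		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
		spin_unlock(&vmap_area_lock);
	}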
>
>> +		goto again;
>> +	}
>> +
>> +	addr = roundup(tmp->va_start, align);
>> +	type = classify_va_fit_type(tmp, addr, size);
>> +	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
>> +		addr = -ENOMEM;
>> +		goto err_out;
>> +	}
>> +
>> +	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
>> +			tmp, addr, size, type);
>> +	if (ret) {
>> +		addr = ret;
>> +		goto err_out;
>> +	}
>> +	spin_unlock(&free_text_area_lock);
>> +
>> +	va->va_start = addr;
>> +	va->va_end = addr + size;
>> +	va->vm = tmp->vm;
>> +
>> +	spin_lock(&vmap_area_lock);
>> +	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> +	spin_unlock(&vmap_area_lock);
>> +
>> +	return (void *)addr;
>> +
>> +err_out:
>> +	spin_unlock(&free_text_area_lock);
>> +	return ERR_PTR(ret);
>> +}
>> +
>> +void vfree_exec(const void *addr)
>> +{
>> +	struct vmap_area *va;
>> +
>> +	might_sleep();
>> +
>> +	spin_lock(&vmap_area_lock);
>> +	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
>> +	if (WARN_ON_ONCE(!va)) {
>> +		spin_unlock(&vmap_area_lock);
>> +		return;
>> +	}
>> +
>> +	unlink_va(va, &vmap_area_root);
>
> Curious why we don't memset to 0 before merge_or_add_vmap_area_augment()?
> I realize other code doesn't seem to do it, though.

We should do the memset here. We will need the text_poke version of it
(rough sketch at the bottom of this mail).

>
>> +	spin_unlock(&vmap_area_lock);
>> +
>> +	spin_lock(&free_text_area_lock);
>> +	merge_or_add_vmap_area_augment(va,
>> +			&free_text_area_root, &free_text_area_list);
>
> I have a concern that we can be using precious physically contiguous memory
> from huge pages to then end up in a situation where we create our own
> pool and allow things to be non-contiguous afterwards.
>
> I'm starting to suspect that if the allocation is > PAGE_SIZE we just
> give it back generally. Otherwise wouldn't the fragmentation cause us
> to eventually just eat up most huge pages available? Probably not for
> eBPF but if we use this on a system with tons of module insertions /
> deletions this seems like it could happen?

Currently, bpf_prog_pack doesn't let allocations > PMD_SIZE share with
smaller allocations. I guess that is similar to the idea here? I am
not sure what the proper threshold for modules is. We can discuss this
later; there is a rough sketch of the bpf_prog_pack approach at the
bottom of this mail.

Thanks,
Song

>
> Luis
>
>> +	spin_unlock(&free_text_area_lock);
>> +	/* TODO: when the whole vm_struct is not in use, free it */
>> +}
>> +
>> /**
>> * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
>> * @size: allocation size
>> @@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>> 			/* It is a BUG(), but trigger recovery instead. */
>> 			goto recovery;
>>
>> -		ret = adjust_va_to_fit_type(va, start, size, type);
>> +		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
>> +				va, start, size, type);
>> 		if (unlikely(ret))
>> 			goto recovery;
>>
>> --
>> 2.30.2
>>
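For the memset question above: since the range stays mapped read-only,
the clearing on free has to go through text poking as well. On x86 it
would be roughly the following in vfree_exec(), before the area goes
back on the free_text tree (sketch only; text_poke_set() is x86-only
and would need an arch abstraction):

	/*
	 * Sketch only: the mapping is still RO+X here, so clear the
	 * freed range through the text poking interface
	 * (text_poke_set() on x86) instead of a plain memset().
	 */
	text_poke_set((void *)va->va_start, 0, va->va_end - va->va_start);

	spin_lock(&free_text_area_lock);
	merge_or_add_vmap_area_augment(va, &free_text_area_root,
				       &free_text_area_list);
	spin_unlock(&free_text_area_lock);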
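And for the fragmentation concern, the bpf_prog_pack policy has
roughly the shape below. This is an illustration of the idea, not the
actual kernel/bpf/core.c code, and pack_or_alloc()/alloc_from_pack()
are made-up names. vmalloc_exec() could use a similar cut-off; the
open question is what the threshold should be for modules:

/* Illustration of the bpf_prog_pack policy; names are made up. */
static void *pack_or_alloc(unsigned long size)
{
	/*
	 * Allocations larger than the pack (huge page) size get their
	 * own allocation and never share a huge page with small users,
	 * which bounds how badly the shared packs can fragment.
	 */
	if (size > PMD_SIZE) {
		size = round_up(size, PAGE_SIZE);
		return module_alloc(size);
	}

	/* otherwise carve the request out of a shared huge-page pack */
	return alloc_from_pack(size);
}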