This is a prototype to host dynamic kernel text (modules, BPF programs,
etc.) on huge pages. It is similar to the proposal by Peter in [1].

A new tree of vmap_area, the free_text_area_* tree, is introduced in
addition to free_vmap_area_* and vmap_area_*. vmalloc_exec() allocates
memory from free_text_area_*. When there is not enough space left in
free_text_area_*, new PMD_SIZE page(s) are allocated from
free_vmap_area_* and added to free_text_area_*. The new tree allows
separate handling of allocations smaller than PAGE_SIZE, as the current
vmalloc code mostly assumes PAGE_SIZE aligned allocations. This version
of vmalloc_exec() can handle BPF programs, which use 64-byte aligned
allocations, and modules, which use PAGE_SIZE aligned allocations.

A rough sketch of how a caller might use the new interface is appended
after the diff for illustration; it is not part of the patch.

[1] https://lore.kernel.org/bpf/Ys6cWUMHO8XwyYgr@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/
---
 include/linux/vmalloc.h |   4 +
 mm/nommu.c              |   7 ++
 mm/vmalloc.c            | 163 +++++++++++++++++++++++++++++++++-------
 3 files changed, 147 insertions(+), 27 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 096d48aa3437..691c02ffe3db 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -35,6 +35,8 @@ struct notifier_block;		/* in notifier.h */
 #define VM_DEFER_KMEMLEAK	0
 #endif
 
+#define VM_KERNEL_EXEC		0x00001000	/* kernel text mapped as RO+X */
+
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -154,6 +156,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_exec(unsigned long size, unsigned long align) __alloc_size(1);
+void vfree_exec(const void *addr);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/nommu.c b/mm/nommu.c
index 9d7afc2d959e..11e0fc996006 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
 }
 EXPORT_SYMBOL(vm_map_pages_zero);
 
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+	return NULL;
+}
+
+void vfree_exec(const void *addr) { }
+
 /*
  * sys_brk() for the most part doesn't need the global kernel
  * lock, except when an application is doing something nasty
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index effd1ff6a4b4..472287e71bf1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -753,6 +753,10 @@ static LIST_HEAD(free_vmap_area_list);
  */
 static struct rb_root free_vmap_area_root = RB_ROOT;
 
+static DEFINE_SPINLOCK(free_text_area_lock);
+static LIST_HEAD(free_text_area_list);
+static struct rb_root free_text_area_root = RB_ROOT;
+
 /*
  * Preload a CPU with one object for "no edge" split case. The
  * aim is to get rid of allocations from the atomic context, thus
@@ -814,9 +818,11 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
 	return va;
 }
 
-static struct vmap_area *__find_vmap_area(unsigned long addr)
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_node *root)
 {
-	struct rb_node *n = vmap_area_root.rb_node;
+	struct rb_node *n;
+
+	n = root ? root : vmap_area_root.rb_node;
 
 	addr = (unsigned long)kasan_reset_tag((void *)addr);
 
@@ -926,7 +932,7 @@ link_va(struct vmap_area *va, struct rb_root *root,
 
 	/* Insert to the rb-tree */
 	rb_link_node(&va->rb_node, parent, link);
-	if (root == &free_vmap_area_root) {
+	if (root == &free_vmap_area_root || root == &free_text_area_root) {
 		/*
 		 * Some explanation here. Just perform simple insertion
 		 * to the tree. We do not set va->subtree_max_size to
@@ -955,7 +961,7 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
 	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
 		return;
 
-	if (root == &free_vmap_area_root)
+	if (root == &free_vmap_area_root || root == &free_text_area_root)
 		rb_erase_augmented(&va->rb_node, root,
 			&free_vmap_area_rb_augment_cb);
 	else
@@ -1198,15 +1204,15 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
  * overhead.
  */
 static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size, unsigned long align,
-	unsigned long vstart, bool adjust_search_size)
+find_vmap_lowest_match(struct rb_node *root, unsigned long size,
+	unsigned long align, unsigned long vstart, bool adjust_search_size)
 {
 	struct vmap_area *va;
 	struct rb_node *node;
 	unsigned long length;
 
 	/* Start from the root. */
-	node = free_vmap_area_root.rb_node;
+	node = root;
 
 	/* Adjust the search size for alignment overhead. */
 	length = adjust_search_size ? size + align - 1 : size;
@@ -1290,8 +1296,9 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
 	get_random_bytes(&rnd, sizeof(rnd));
 	vstart = VMALLOC_START + rnd;
 
-	va_1 = find_vmap_lowest_match(size, align, vstart, false);
-	va_2 = find_vmap_lowest_linear_match(size, align, vstart);
+	va_1 = find_vmap_lowest_match(free_vmap_area_root.rb_node, size,
+				      align, vstart, false);
+	va_2 = find_vmap_lowest_linear_match(root, size, align, vstart);
 
 	if (va_1 != va_2)
 		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1334,7 +1341,8 @@ classify_va_fit_type(struct vmap_area *va,
 }
 
 static __always_inline int
-adjust_va_to_fit_type(struct vmap_area *va,
+adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
+	struct vmap_area *va,
 	unsigned long nva_start_addr, unsigned long size,
 	enum fit_type type)
 {
@@ -1348,7 +1356,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		 * V      NVA      V
 		 * |---------------|
 		 */
-		unlink_va(va, &free_vmap_area_root);
+		unlink_va(va, root);
 		kmem_cache_free(vmap_area_cachep, va);
 	} else if (type == LE_FIT_TYPE) {
 		/*
@@ -1426,8 +1434,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		augment_tree_propagate_from(va);
 
 		if (lva)	/* type == NE_FIT_TYPE */
-			insert_vmap_area_augment(lva, &va->rb_node,
-				&free_vmap_area_root, &free_vmap_area_list);
+			insert_vmap_area_augment(lva, &va->rb_node, root, head);
 	}
 
 	return 0;
@@ -1459,7 +1466,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
 	if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
 		adjust_search_size = false;
 
-	va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
+	va = find_vmap_lowest_match(free_vmap_area_root.rb_node,
+				    size, align, vstart, adjust_search_size);
 	if (unlikely(!va))
 		return vend;
 
@@ -1478,7 +1486,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
 		return vend;
 
 	/* Update the free vmap_area. */
-	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+	ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+				    va, nva_start_addr, size, type);
 	if (ret)
 		return vend;
 
@@ -1539,7 +1548,7 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
 static struct vmap_area *alloc_vmap_area(unsigned long size,
 				unsigned long align,
 				unsigned long vstart, unsigned long vend,
-				int node, gfp_t gfp_mask)
+				int node, unsigned long vm_flags, gfp_t gfp_mask)
 {
 	struct vmap_area *va;
 	unsigned long freed;
@@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->va_end = addr + size;
 	va->vm = NULL;
 
-	spin_lock(&vmap_area_lock);
-	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
-	spin_unlock(&vmap_area_lock);
+	if (vm_flags & VM_KERNEL_EXEC) {
+		spin_lock(&free_text_area_lock);
+		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
+		/* update subtree_max_size now as we need this soon */
+		augment_tree_propagate_from(va);
+		spin_unlock(&free_text_area_lock);
+	} else {
+		spin_lock(&vmap_area_lock);
+		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+		spin_unlock(&vmap_area_lock);
+	}
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -1803,7 +1820,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
 	struct vmap_area *va;
 
 	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr);
+	va = __find_vmap_area(addr, vmap_area_root.rb_node);
 	spin_unlock(&vmap_area_lock);
 
 	return va;
@@ -1912,8 +1929,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 		return ERR_PTR(-ENOMEM);
 	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
-					VMALLOC_START, VMALLOC_END,
-					node, gfp_mask);
+					VMALLOC_START, VMALLOC_END,
+					node, 0, gfp_mask);
 	if (IS_ERR(va)) {
 		kfree(vb);
 		return ERR_CAST(va);
 	}
@@ -2209,8 +2226,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
 		addr = (unsigned long)mem;
 	} else {
 		struct vmap_area *va;
-		va = alloc_vmap_area(size, PAGE_SIZE,
-				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+		va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END,
+				node, 0, GFP_KERNEL);
 		if (IS_ERR(va))
 			return NULL;
 
@@ -2450,7 +2467,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 	if (!(flags & VM_NO_GUARD))
 		size += PAGE_SIZE;
-	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+	va = alloc_vmap_area(size, align, start, end, node, flags, gfp_mask);
 	if (IS_ERR(va)) {
 		kfree(area);
 		return NULL;
 	}
@@ -2546,7 +2563,7 @@ struct vm_struct *remove_vm_area(const void *addr)
 	might_sleep();
 
 	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area((unsigned long)addr);
+	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
 	if (va && va->vm) {
 		struct vm_struct *vm = va->vm;
 
@@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc);
 
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+	struct vmap_area *va, *tmp;
+	unsigned long addr;
+	enum fit_type type;
+	int ret;
+
+	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
+	if (unlikely(!va))
+		return ERR_PTR(-ENOMEM);
+
+again:
+	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
+	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
+				     size, align, 1, false);
+
+	if (!tmp) {
+		unsigned long alloc_size;
+		void *ptr;
+
+		spin_unlock(&free_text_area_lock);
+
+		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
+		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
+					   MODULES_END, GFP_KERNEL, PAGE_KERNEL,
+					   VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
+					   NUMA_NO_NODE, __builtin_return_address(0));
+		if (unlikely(!ptr)) {
+			ret = -ENOMEM;
+			goto err_out;
+		}
+		memset(ptr, 0, alloc_size);
+		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+
+		goto again;
+	}
+
+	addr = roundup(tmp->va_start, align);
+	type = classify_va_fit_type(tmp, addr, size);
+	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
+		addr = -ENOMEM;
+		goto err_out;
+	}
+
+	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
+				    tmp, addr, size, type);
+	if (ret) {
+		addr = ret;
+		goto err_out;
+	}
+	spin_unlock(&free_text_area_lock);
+
+	va->va_start = addr;
+	va->va_end = addr + size;
+	va->vm = tmp->vm;
+
+	spin_lock(&vmap_area_lock);
+	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+	spin_unlock(&vmap_area_lock);
+
+	return (void *)addr;
+
+err_out:
+	spin_unlock(&free_text_area_lock);
+	return ERR_PTR(ret);
+}
+
+void vfree_exec(const void *addr)
+{
+	struct vmap_area *va;
+
+	might_sleep();
+
+	spin_lock(&vmap_area_lock);
+	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
+	if (WARN_ON_ONCE(!va)) {
+		spin_unlock(&vmap_area_lock);
+		return;
+	}
+
+	unlink_va(va, &vmap_area_root);
+	spin_unlock(&vmap_area_lock);
+
+	spin_lock(&free_text_area_lock);
+	merge_or_add_vmap_area_augment(va,
+			&free_text_area_root, &free_text_area_list);
+	spin_unlock(&free_text_area_lock);
+	/* TODO: when the whole vm_struct is not in use, free it */
+}
+
 /**
  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
  * @size: allocation size
@@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 			/* It is a BUG(), but trigger recovery instead. */
 			goto recovery;
 
-		ret = adjust_va_to_fit_type(va, start, size, type);
+		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+					    va, start, size, type);
 		if (unlikely(ret))
 			goto recovery;
 
-- 
2.30.2
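
Usage sketch (illustration only, not part of the patch): the
jit_image_alloc()/jit_image_free() helpers below are made-up names for
this example, and writing into the RO+X region is shown with the
x86-only text_poke_copy() helper; a real caller would need an
arch-appropriate text-poking interface.

/* Illustration only -- not part of this patch. */
#include <linux/err.h>
#include <linux/vmalloc.h>
#include <asm/text-patching.h>	/* text_poke_copy(), x86 only */

/*
 * Allocate executable memory for @len bytes of generated text and copy
 * the instructions in. BPF-style sub-page allocations only need
 * 64-byte alignment.
 */
static void *jit_image_alloc(const void *insns, size_t len)
{
	void *image;

	image = vmalloc_exec(len, 64);
	if (IS_ERR_OR_NULL(image))
		return NULL;

	/*
	 * The range handed out by vmalloc_exec() is already mapped
	 * RO+X, so a plain memcpy() would fault. The text has to be
	 * written through a text-poking interface instead.
	 */
	text_poke_copy(image, insns, len);

	return image;
}

static void jit_image_free(void *image)
{
	/* Returns the range to the free_text_area_* tree for reuse. */
	vfree_exec(image);
}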