[RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec

This is a prototype for hosting dynamic kernel text (modules, BPF
programs, etc.) on huge pages. It is similar to the proposal by Peter
in [1].

A new tree of vmap_area, the free_text_area_* tree, is introduced in
addition to free_vmap_area_* and vmap_area_*. vmalloc_exec allocates
memory from free_text_area_*. When there isn't enough space left in
free_text_area_*, new PMD_SIZE page(s) are allocated from
free_vmap_area_* and added to free_text_area_*.
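
For illustration only (this snippet is not part of the series, and the
function name text_pool_smoke_test is made up), a minimal sketch of the
intended behavior: the first allocation finds free_text_area_* empty,
so a PMD_SIZE chunk is carved out of free_vmap_area_* and mapped RO+X;
the second allocation is then served from the remainder of that chunk:

  #include <linux/bug.h>
  #include <linux/err.h>
  #include <linux/init.h>
  #include <linux/pgtable.h>
  #include <linux/vmalloc.h>

  static int __init text_pool_smoke_test(void)
  {
  	void *a, *b;

  	/* free_text_area_* is empty: triggers a PMD_SIZE refill */
  	a = vmalloc_exec(64, 64);
  	if (IS_ERR_OR_NULL(a))
  		return -ENOMEM;

  	/* served from the remainder of the same chunk */
  	b = vmalloc_exec(64, 64);
  	if (IS_ERR_OR_NULL(b)) {
  		vfree_exec(a);
  		return -ENOMEM;
  	}

  	/* both allocations are expected to share one PMD_SIZE window */
  	WARN_ON(((unsigned long)a & PMD_MASK) !=
  		((unsigned long)b & PMD_MASK));

  	vfree_exec(b);
  	vfree_exec(a);
  	return 0;
  }
  late_initcall(text_pool_smoke_test);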

The new tree allows separate handling of allocations smaller than
PAGE_SIZE, as the current vmalloc code mostly assumes PAGE_SIZE aligned
allocations. This version of vmalloc_exec can handle BPF programs,
which use 64-byte aligned allocations, and modules, which use PAGE_SIZE
aligned allocations.
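
As a usage sketch (hypothetical callers, for illustration only; the
function name alignment_sketch is made up): a BPF-style user would
request 64-byte alignment and a module-style user PAGE_SIZE alignment.
In both cases the memory comes back already mapped RO+X, so it has to
be filled through a text poking interface (e.g. text_poke_copy() on
x86) rather than with direct stores:

  #include <linux/bug.h>
  #include <linux/err.h>
  #include <linux/kernel.h>
  #include <linux/mm.h>
  #include <linux/vmalloc.h>

  static void alignment_sketch(void)
  {
  	/* BPF-style: sub-page, 64-byte aligned allocation */
  	void *bpf_img = vmalloc_exec(512, 64);
  	/* module-style: PAGE_SIZE aligned allocation */
  	void *mod_text = vmalloc_exec(8 * PAGE_SIZE, PAGE_SIZE);

  	if (!IS_ERR_OR_NULL(bpf_img)) {
  		WARN_ON(!IS_ALIGNED((unsigned long)bpf_img, 64));
  		vfree_exec(bpf_img);
  	}
  	if (!IS_ERR_OR_NULL(mod_text)) {
  		WARN_ON(!IS_ALIGNED((unsigned long)mod_text, PAGE_SIZE));
  		vfree_exec(mod_text);
  	}
  }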

[1] https://lore.kernel.org/bpf/Ys6cWUMHO8XwyYgr@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/
---
 include/linux/vmalloc.h |   4 +
 mm/nommu.c              |   7 ++
 mm/vmalloc.c            | 163 +++++++++++++++++++++++++++++++++-------
 3 files changed, 147 insertions(+), 27 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 096d48aa3437..691c02ffe3db 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -35,6 +35,8 @@ struct notifier_block;		/* in notifier.h */
 #define VM_DEFER_KMEMLEAK	0
 #endif
 
+#define VM_KERNEL_EXEC		0x00001000	/* kernel text mapped as RO+X */
+
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -154,6 +156,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_exec(unsigned long size, unsigned long align) __alloc_size(1);
+void vfree_exec(const void *addr);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/nommu.c b/mm/nommu.c
index 9d7afc2d959e..11e0fc996006 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
 }
 EXPORT_SYMBOL(vm_map_pages_zero);
 
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+	return NULL;
+}
+
+void vfree_exec(const void *addr) { }
+
 /*
  *  sys_brk() for the most part doesn't need the global kernel
  *  lock, except when an application is doing something nasty
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index effd1ff6a4b4..472287e71bf1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -753,6 +753,10 @@ static LIST_HEAD(free_vmap_area_list);
  */
 static struct rb_root free_vmap_area_root = RB_ROOT;
 
+static DEFINE_SPINLOCK(free_text_area_lock);
+static LIST_HEAD(free_text_area_list);
+static struct rb_root free_text_area_root = RB_ROOT;
+
 /*
  * Preload a CPU with one object for "no edge" split case. The
  * aim is to get rid of allocations from the atomic context, thus
@@ -814,9 +818,11 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
 	return va;
 }
 
-static struct vmap_area *__find_vmap_area(unsigned long addr)
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_node *root)
 {
-	struct rb_node *n = vmap_area_root.rb_node;
+	struct rb_node *n;
+
+	n = root ? root : vmap_area_root.rb_node;
 
 	addr = (unsigned long)kasan_reset_tag((void *)addr);
 
@@ -926,7 +932,7 @@ link_va(struct vmap_area *va, struct rb_root *root,
 
 	/* Insert to the rb-tree */
 	rb_link_node(&va->rb_node, parent, link);
-	if (root == &free_vmap_area_root) {
+	if (root == &free_vmap_area_root || root == &free_text_area_root) {
 		/*
 		 * Some explanation here. Just perform simple insertion
 		 * to the tree. We do not set va->subtree_max_size to
@@ -955,7 +961,7 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
 	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
 		return;
 
-	if (root == &free_vmap_area_root)
+	if (root == &free_vmap_area_root || root == &free_text_area_root)
 		rb_erase_augmented(&va->rb_node,
 			root, &free_vmap_area_rb_augment_cb);
 	else
@@ -1198,15 +1204,15 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
  * overhead.
  */
 static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size, unsigned long align,
-	unsigned long vstart, bool adjust_search_size)
+find_vmap_lowest_match(struct rb_node *root, unsigned long size,
+       unsigned long align, unsigned long vstart, bool adjust_search_size)
 {
 	struct vmap_area *va;
 	struct rb_node *node;
 	unsigned long length;
 
 	/* Start from the root. */
-	node = free_vmap_area_root.rb_node;
+	node = root;
 
 	/* Adjust the search size for alignment overhead. */
 	length = adjust_search_size ? size + align - 1 : size;
@@ -1290,8 +1296,9 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
 	get_random_bytes(&rnd, sizeof(rnd));
 	vstart = VMALLOC_START + rnd;
 
-	va_1 = find_vmap_lowest_match(size, align, vstart, false);
-	va_2 = find_vmap_lowest_linear_match(size, align, vstart);
+	va_1 = find_vmap_lowest_match(free_vmap_area_root.rb_node, size,
+				      align, vstart, false);
+	va_2 = find_vmap_lowest_linear_match(root, size, align, vstart);
 
 	if (va_1 != va_2)
 		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1334,7 +1341,8 @@ classify_va_fit_type(struct vmap_area *va,
 }
 
 static __always_inline int
-adjust_va_to_fit_type(struct vmap_area *va,
+adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
+	struct vmap_area *va,
 	unsigned long nva_start_addr, unsigned long size,
 	enum fit_type type)
 {
@@ -1348,7 +1356,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		 * V      NVA      V
 		 * |---------------|
 		 */
-		unlink_va(va, &free_vmap_area_root);
+		unlink_va(va, root);
 		kmem_cache_free(vmap_area_cachep, va);
 	} else if (type == LE_FIT_TYPE) {
 		/*
@@ -1426,8 +1434,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		augment_tree_propagate_from(va);
 
 		if (lva)	/* type == NE_FIT_TYPE */
-			insert_vmap_area_augment(lva, &va->rb_node,
-				&free_vmap_area_root, &free_vmap_area_list);
+			insert_vmap_area_augment(lva, &va->rb_node, root, head);
 	}
 
 	return 0;
@@ -1459,7 +1466,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
 	if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
 		adjust_search_size = false;
 
-	va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
+	va = find_vmap_lowest_match(free_vmap_area_root.rb_node,
+				    size, align, vstart, adjust_search_size);
 	if (unlikely(!va))
 		return vend;
 
@@ -1478,7 +1486,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
 		return vend;
 
 	/* Update the free vmap_area. */
-	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+	ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+				    va, nva_start_addr, size, type);
 	if (ret)
 		return vend;
 
@@ -1539,7 +1548,7 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
 static struct vmap_area *alloc_vmap_area(unsigned long size,
 				unsigned long align,
 				unsigned long vstart, unsigned long vend,
-				int node, gfp_t gfp_mask)
+				int node, unsigned long vm_flags, gfp_t gfp_mask)
 {
 	struct vmap_area *va;
 	unsigned long freed;
@@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->va_end = addr + size;
 	va->vm = NULL;
 
-	spin_lock(&vmap_area_lock);
-	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
-	spin_unlock(&vmap_area_lock);
+	if (vm_flags & VM_KERNEL_EXEC) {
+		spin_lock(&free_text_area_lock);
+		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
+		/* update subtree_max_size now as we need this soon */
+		augment_tree_propagate_from(va);
+		spin_unlock(&free_text_area_lock);
+	} else {
+		spin_lock(&vmap_area_lock);
+		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+		spin_unlock(&vmap_area_lock);
+	}
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -1803,7 +1820,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
 	struct vmap_area *va;
 
 	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr);
+	va = __find_vmap_area(addr, vmap_area_root.rb_node);
 	spin_unlock(&vmap_area_lock);
 
 	return va;
@@ -1912,8 +1929,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 		return ERR_PTR(-ENOMEM);
 
 	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
-					VMALLOC_START, VMALLOC_END,
-					node, gfp_mask);
+				     VMALLOC_START, VMALLOC_END,
+				     node, 0, gfp_mask);
 	if (IS_ERR(va)) {
 		kfree(vb);
 		return ERR_CAST(va);
@@ -2209,8 +2226,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
 		addr = (unsigned long)mem;
 	} else {
 		struct vmap_area *va;
-		va = alloc_vmap_area(size, PAGE_SIZE,
-				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+		va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END,
+				     node, 0, GFP_KERNEL);
 		if (IS_ERR(va))
 			return NULL;
 
@@ -2450,7 +2467,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 	if (!(flags & VM_NO_GUARD))
 		size += PAGE_SIZE;
 
-	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+	va = alloc_vmap_area(size, align, start, end, node, flags, gfp_mask);
 	if (IS_ERR(va)) {
 		kfree(area);
 		return NULL;
@@ -2546,7 +2563,7 @@ struct vm_struct *remove_vm_area(const void *addr)
 	might_sleep();
 
 	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area((unsigned long)addr);
+	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
 	if (va && va->vm) {
 		struct vm_struct *vm = va->vm;
 
@@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc);
 
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+	struct vmap_area *va, *tmp;
+	unsigned long addr;
+	enum fit_type type;
+	int ret;
+
+	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
+	if (unlikely(!va))
+		return ERR_PTR(-ENOMEM);
+
+again:
+	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
+	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
+				     size, align, 1, false);
+
+	if (!tmp) {
+		unsigned long alloc_size;
+		void *ptr;
+
+		spin_unlock(&free_text_area_lock);
+
+		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
+		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
+					   MODULES_END, GFP_KERNEL, PAGE_KERNEL,
+					   VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
+					   NUMA_NO_NODE, __builtin_return_address(0));
+		if (unlikely(!ptr)) {
+			kmem_cache_free(vmap_area_cachep, va);
+			return ERR_PTR(-ENOMEM);
+		}
+		memset(ptr, 0, alloc_size);
+		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+
+		goto again;
+	}
+
+	addr = roundup(tmp->va_start, align);
+	type = classify_va_fit_type(tmp, addr, size);
+	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
+		ret = -ENOMEM;
+		goto err_out;
+	}
+
+	/* copy out of tmp before adjust_va_to_fit_type(), which may free it */
+	va->va_start = addr;
+	va->va_end = addr + size;
+	va->vm = tmp->vm;
+
+	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
+				    tmp, addr, size, type);
+	if (ret)
+		goto err_out;
+	spin_unlock(&free_text_area_lock);
+
+	spin_lock(&vmap_area_lock);
+	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+	spin_unlock(&vmap_area_lock);
+
+	return (void *)addr;
+
+err_out:
+	spin_unlock(&free_text_area_lock);
+	kmem_cache_free(vmap_area_cachep, va);
+	return ERR_PTR(ret);
+}
+
+void vfree_exec(const void *addr)
+{
+	struct vmap_area *va;
+
+	might_sleep();
+
+	spin_lock(&vmap_area_lock);
+	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
+	if (WARN_ON_ONCE(!va)) {
+		spin_unlock(&vmap_area_lock);
+		return;
+	}
+
+	unlink_va(va, &vmap_area_root);
+	spin_unlock(&vmap_area_lock);
+
+	spin_lock(&free_text_area_lock);
+	merge_or_add_vmap_area_augment(va,
+		&free_text_area_root, &free_text_area_list);
+	spin_unlock(&free_text_area_lock);
+	/* TODO: when the whole vm_struct is not in use, free it */
+}
+
 /**
  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
  * @size:      allocation size
@@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 			/* It is a BUG(), but trigger recovery instead. */
 			goto recovery;
 
-		ret = adjust_va_to_fit_type(va, start, size, type);
+		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+					    va, start, size, type);
 		if (unlikely(ret))
 			goto recovery;
 
-- 
2.30.2





