[RFC 4/5] vmalloc_exec: share a huge page with kernel text

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On x86 kernel, we allocate 2MB pages for kernel text up to
round_down(_etext, 2MB). Therefore, some of the kernel text is still
on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
round_up(_etext, 2MB), and use the rest of the page for modules and
BPF programs.

Here is an example:

[root@eth50-1 ~]# grep _etext /proc/kallsyms
ffffffff82202a08 T _etext

[root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]

[root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd

[root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
ffffffff822bc580 t xfs_flush_inodes     [xfs]

ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, xfs
module, and bpf programs.
---
 arch/x86/mm/init_64.c |  3 ++-
 mm/vmalloc.c          | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 39c5246964a9..d27d0af5beb5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
 
 int kernel_set_to_readonly;
 
+#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
 	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
 	unsigned long end = (unsigned long)__end_rodata_hpage_align;
-	unsigned long text_end = PFN_ALIGN(_etext);
+	unsigned long text_end = PMD_ALIGN(_etext);
 	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
 	unsigned long all_end;
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 472287e71bf1..5f3b5df9313f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -72,6 +72,11 @@ early_param("nohugevmalloc", set_nohugevmalloc);
 static const bool vmap_allow_huge = false;
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
 
+#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
+
+static struct vm_struct text_tail_vm;
+static struct vmap_area text_tail_va;
+
 bool is_vmalloc_addr(const void *x)
 {
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
@@ -634,6 +639,8 @@ int is_vmalloc_or_module_addr(const void *x)
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
 	if (addr >= MODULES_VADDR && addr < MODULES_END)
 		return 1;
+	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
+		return 1;
 #endif
 	return is_vmalloc_addr(x);
 }
@@ -2371,6 +2378,25 @@ static void vmap_init_free_space(void)
 	}
 }
 
+static void register_text_tail_vm(void)
+{
+	unsigned long start = PFN_ALIGN(_etext);
+	unsigned long end = PMD_ALIGN(_etext);
+	struct vmap_area *va;
+
+	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+	if (WARN_ON_ONCE(!va))
+		return;
+	text_tail_vm.addr = (void *)start;
+	text_tail_vm.size = end - start;
+	text_tail_vm.flags = VM_KERNEL_EXEC;
+	text_tail_va.va_start = start;
+	text_tail_va.va_end = end;
+	text_tail_va.vm = &text_tail_vm;
+	memcpy(va, &text_tail_va, sizeof(*va));
+	insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
@@ -2381,6 +2407,7 @@ void __init vmalloc_init(void)
 	 * Create the cache for vmap_area objects.
 	 */
 	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+	register_text_tail_vm();
 
 	for_each_possible_cpu(i) {
 		struct vmap_block_queue *vbq;
-- 
2.30.2






[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux