CC some vmalloc folks. What happened is that huge-page vmalloc was
turned on for x86 while the rest of the kernel was unprepared, and
problems have popped up. We are discussing a possible new config and
vmap flag such that, for some arches, huge pages would only be used
for certain allocations such as BPF's.

Thread:
https://lore.kernel.org/lkml/6080EC28-E3FE-4B00-B94A-ED7EBA1F55ED@xxxxxx/

On Tue, 2022-03-29 at 08:23 +0000, Song Liu wrote:
> > On Mar 28, 2022, at 5:18 PM, Edgecombe, Rick P
> > <rick.p.edgecombe@xxxxxxxxx> wrote:
> > 
> > On Mon, 2022-03-28 at 23:27 +0000, Song Liu wrote:
> > > I like this direction. But I am afraid this is not enough. Using
> > > VM_NO_HUGE_VMAP in module_alloc() will make sure we don't
> > > allocate huge pages for modules. But other users of
> > > __vmalloc_node_range(), such as vzalloc in Paul's report, may
> > > still hit the issue.
> > > 
> > > Maybe we need another flag VM_FORCE_HUGE_VMAP that bypasses the
> > > vmap_allow_huge check. Something like the diff below.
> > > 
> > > Would this work?
> > 
> > Yea, that looks like a safer direction. It's too bad we can't have
> > automatic large pages, but it doesn't seem ready to just turn on
> > for the whole x86 kernel.
> > 
> > I'm not sure about this implementation though. It would let large
> > pages get enabled without HAVE_ARCH_HUGE_VMALLOC and also despite
> > the disable kernel parameter.
> > 
> > Apparently some architectures can handle large pages automatically
> > and it has benefits for them, so maybe vmalloc should support both
> > behaviors based on config. Like there should be an
> > ARCH_HUGE_VMALLOC_REQUIRE_FLAG config. If configured, it requires
> > VM_HUGE_VMAP (or some name). I don't think FORCE fits, because the
> > current logic would not always give huge pages.
> > 
> > But yea, it seems risky to leave it on generally, even if you
> > could fix Paul's specific issue.
> 
> How about something like the following? (I still need to fix
> something, but would like some feedback on the API.)
> 
> Thanks,
> Song
> 
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 84bc1de02720..defef50ff527 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -858,6 +858,15 @@ config HAVE_ARCH_HUGE_VMALLOC
>  	depends on HAVE_ARCH_HUGE_VMAP
>  	bool
>  
> +#
> +# HAVE_ARCH_HUGE_VMALLOC_FLAG allows users of __vmalloc_node_range to
> +# allocate huge pages without HAVE_ARCH_HUGE_VMALLOC. To allocate huge
> +# pages, the user needs to call __vmalloc_node_range with VM_PREFER_HUGE_VMAP.
> +#
> +config HAVE_ARCH_HUGE_VMALLOC_FLAG
> +	depends on HAVE_ARCH_HUGE_VMAP
> +	bool
> +
>  config ARCH_WANT_HUGE_PMD_SHARE
>  	bool
>  
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 7340d9f01b62..e64f00415575 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -161,7 +161,7 @@ config X86
>  	select HAVE_ALIGNED_STRUCT_PAGE		if SLUB
>  	select HAVE_ARCH_AUDITSYSCALL
>  	select HAVE_ARCH_HUGE_VMAP		if X86_64 || X86_PAE
> -	select HAVE_ARCH_HUGE_VMALLOC		if X86_64
> +	select HAVE_ARCH_HUGE_VMALLOC_FLAG	if X86_64
>  	select HAVE_ARCH_JUMP_LABEL
>  	select HAVE_ARCH_JUMP_LABEL_RELATIVE
>  	select HAVE_ARCH_KASAN			if X86_64
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 3b1df7da402d..98f8a93fcfd4 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -35,6 +35,11 @@ struct notifier_block;		/* in notifier.h */
>  #define VM_DEFER_KMEMLEAK	0
>  #endif
>  
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC_FLAG
> +#define VM_PREFER_HUGE_VMAP	0x00001000	/* prefer PMD_SIZE mapping (bypass vmap_allow_huge check) */

I don't think we should tie this to vmap_allow_huge by definition.
Also, what it actually does is try 2MB pages for allocations larger
than 2MB (the existing logic only bumps the page order once
size_per_node reaches PMD_SIZE). For smaller allocations it doesn't
try or "prefer" them.

> +#else
> +#define VM_PREFER_HUGE_VMAP	0
> +#endif
>  /* bits [20..32] reserved for arch specific ioremap internals */
>  
>  /*
> @@ -51,7 +56,7 @@ struct vm_struct {
>  	unsigned long		size;
>  	unsigned long		flags;
>  	struct page		**pages;
> -#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +#if (defined(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || defined(CONFIG_HAVE_ARCH_HUGE_VMALLOC_FLAG))
>  	unsigned int		page_order;
>  #endif
>  	unsigned int		nr_pages;
> @@ -225,7 +230,7 @@ static inline bool is_vm_area_hugepages(const void *addr)
>  	 * prevents that. This only indicates the size of the physical page
>  	 * allocated in the vmalloc layer.
>  	 */
> -#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +#if (defined(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || defined(CONFIG_HAVE_ARCH_HUGE_VMALLOC_FLAG))
>  	return find_vm_area(addr)->page_order > 0;
>  #else
>  	return false;
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 13e9dbeeedf3..fc9dae095079 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -851,13 +851,28 @@ static LIST_HEAD(pack_list);
>  #define BPF_HPAGE_MASK PAGE_MASK
>  #endif
>  
> +static void *bpf_prog_pack_vmalloc(unsigned long size)
> +{
> +#if defined(MODULES_VADDR)
> +	unsigned long start = MODULES_VADDR;
> +	unsigned long end = MODULES_END;
> +#else
> +	unsigned long start = VMALLOC_START;
> +	unsigned long end = VMALLOC_END;
> +#endif
> +
> +	return __vmalloc_node_range(size, PAGE_SIZE, start, end, GFP_KERNEL, PAGE_KERNEL,
> +				    VM_DEFER_KMEMLEAK | VM_PREFER_HUGE_VMAP,
> +				    NUMA_NO_NODE, __builtin_return_address(0));
> +}
> +
>  static size_t select_bpf_prog_pack_size(void)
>  {
>  	size_t size;
>  	void *ptr;
>  
>  	size = BPF_HPAGE_SIZE * num_online_nodes();
> -	ptr = module_alloc(size);
> +	ptr = bpf_prog_pack_vmalloc(size);
>  
>  	/* Test whether we can get huge pages. If not just use PAGE_SIZE
>  	 * packs.
> @@ -881,7 +896,7 @@ static struct bpf_prog_pack *alloc_new_pack(void)
>  			       GFP_KERNEL);
>  	if (!pack)
>  		return NULL;
> -	pack->ptr = module_alloc(bpf_prog_pack_size);
> +	pack->ptr = bpf_prog_pack_vmalloc(bpf_prog_pack_size);
>  	if (!pack->ptr) {
>  		kfree(pack);
>  		return NULL;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index e163372d3967..9d3c1ab8bdf1 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2261,7 +2261,7 @@ static inline unsigned int vm_area_page_order(struct vm_struct *vm)
>  
>  static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
>  {
> -#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +#if (defined(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || defined(CONFIG_HAVE_ARCH_HUGE_VMALLOC_FLAG))
>  	vm->page_order = order;
>  #else
>  	BUG_ON(order != 0);
> @@ -3106,7 +3106,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  		return NULL;
>  	}
>  
> -	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) {
> +	if ((vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) ||
> +	    (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC_FLAG) && (vm_flags & VM_PREFER_HUGE_VMAP))) {

vmap_allow_huge is how the kernel parameter that can disable vmalloc
huge pages works. I don't think we should separate the new flag from
it. Disabling vmalloc huge pages should still disable this alternate
mode; see the sketch at the bottom of this mail.

>  		unsigned long size_per_node;
>  
>  		/*
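Something like the below is closer to what I had in mind. It is just
an untested sketch, and it assumes the vmap_allow_huge definition (and
the nohugevmalloc early_param that clears it) would also be compiled
in when only CONFIG_HAVE_ARCH_HUGE_VMALLOC_FLAG is selected, since
today that #ifdef only covers CONFIG_HAVE_ARCH_HUGE_VMALLOC:

	/*
	 * Huge mappings stay gated by vmap_allow_huge, so the
	 * nohugevmalloc kernel parameter disables them everywhere.
	 * Arches with HAVE_ARCH_HUGE_VMALLOC get them for any large
	 * enough allocation unless VM_NO_HUGE_VMAP is passed; arches
	 * with only HAVE_ARCH_HUGE_VMALLOC_FLAG get them only when
	 * the caller opts in with VM_PREFER_HUGE_VMAP.
	 */
	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
	    (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
	     (vm_flags & VM_PREFER_HUGE_VMAP))) {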