Hi Andrew,

On Thu, 2010-11-11 at 14:25 -0800, Andrew Morton wrote:
> > So do you think we should change all that?
>
> Oh God, what have you done :(
>
> No, I don't think we want to add a gfp_t to all of that code to fix one
> stupid bug in vmalloc().
>
> > Or do you prefer the per-task mask? Or maybe even both? :-)
>
> Right now I'm thinking that the thing to do is to do the
> pass-gfp_t-via-task_struct thing.

I have attached my first attempt to fix this in the easiest way I could
think of. Please note that the code is untested at the moment (I haven't
even tried to compile it yet) :-)

I would like to test it, but first I would like your feedback, to make
sure that I'm at least going in the right direction.

I'm not sure whether, at this point, we want to do what we discussed
before, e.g. making sure that the entire kernel uses this per-thread mask
whenever it switches context, or whenever it crosses the mm code
boundary. For now, and at least for us, I think my patch suffices to fix
the vmalloc() problem, and in addition we can use the new per-thread
gfp_mask API to get a much stronger guarantee that our I/O threads never
allocate memory with __GFP_IO.

Please let me know what you think.

Thanks,
Ricardo
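P.S. To illustrate the second use case: here is roughly how one of our
filesystem I/O threads could use the new API. This is a minimal, untested
sketch; my_fs_io_thread() is a made-up name, and only
get_thread_gfp_mask() / set_thread_gfp_mask() come from the patch below:

	static int my_fs_io_thread(void *data)
	{
		/*
		 * Clear __GFP_FS and __GFP_IO from this thread's allowed
		 * mask, so that every allocation made by this thread is
		 * effectively GFP_NOIO, even in code we don't control.
		 */
		set_thread_gfp_mask(get_thread_gfp_mask() &
				    ~(__GFP_FS | __GFP_IO));

		while (!kthread_should_stop()) {
			/* ... do the actual I/O work ... */
		}

		return 0;
	}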
From c3cb300a9c7d6e540312d0ae75ec869cb83e853d Mon Sep 17 00:00:00 2001
From: Ricardo M. Correia <ricardo.correia@xxxxxxxxxx>
Date: Mon, 15 Nov 2010 17:45:29 +0100
Subject: [PATCH] Fix __vmalloc() to always respect the gfp flags that the
 caller asked for.

When __vmalloc() / __vmalloc_area_node() calls map_vm_area(), the latter
can allocate pages with GFP_KERNEL despite the caller of __vmalloc()
having requested a stricter gfp mask.

We fix this by introducing a per-thread gfp_mask, similar to
gfp_allowed_mask but which only applies to the current thread.
__vmalloc_area_node() will now temporarily restrict the per-thread
gfp_mask when it calls map_vm_area().

This new per-thread gfp mask may also be used for other useful purposes,
for example, after thread creation, to make sure that certain threads
(e.g. filesystem I/O threads) never allocate memory with certain flags
(e.g. __GFP_FS or __GFP_IO).
---
 include/linux/gfp.h       |   13 +++++++++++++
 include/linux/init_task.h |    1 +
 include/linux/sched.h     |    1 +
 mm/page_alloc.c           |    2 ++
 mm/slab.c                 |    3 +++
 mm/slub.c                 |    4 ++++
 mm/vmalloc.c              |   20 +++++++++++++++++++-
 7 files changed, 43 insertions(+), 1 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index e8713d5..22553d1 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -6,6 +6,8 @@
 #include <linux/linkage.h>
 #include <linux/topology.h>
 #include <linux/mmdebug.h>
+#include <linux/sched.h>
+#include <asm/current.h>
 
 struct vm_area_struct;
 
@@ -363,4 +365,15 @@ extern gfp_t gfp_allowed_mask;
 extern void set_gfp_allowed_mask(gfp_t mask);
 extern gfp_t clear_gfp_allowed_mask(gfp_t mask);
 
+/* Per-thread allowed gfp mask */
+static inline gfp_t get_thread_gfp_mask(void)
+{
+	return get_current()->gfp_mask;
+}
+
+static inline void set_thread_gfp_mask(gfp_t mask)
+{
+	get_current()->gfp_mask = mask;
+}
+
 #endif /* __LINUX_GFP_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f8c06c..516429b 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -120,6 +120,7 @@ extern struct cred init_cred;
 	.stack		= &init_thread_info,				\
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= PF_KTHREAD,					\
+	.gfp_mask	= ~0,						\
 	.lock_depth	= -1,						\
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d0036e5..6655361 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1183,6 +1183,7 @@ struct task_struct {
 	void *stack;
 	atomic_t usage;
 	unsigned int flags;	/* per process flags, defined below */
+	gfp_t gfp_mask;		/* per process mask of allowed gfp flags */
 	unsigned int ptrace;
 
 	int lock_depth;		/* BKL lock depth */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07a6544..a3a47c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
+#include <linux/gfp.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -2128,6 +2129,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 
 	gfp_mask &= gfp_allowed_mask;
+	gfp_mask &= get_thread_gfp_mask();
 
 	lockdep_trace_alloc(gfp_mask);
 
diff --git a/mm/slab.c b/mm/slab.c
index b1e40da..7e1ddc7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
 #include <linux/debugobjects.h>
 #include <linux/kmemcheck.h>
 #include <linux/memory.h>
+#include <linux/gfp.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -3391,6 +3392,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	int slab_node = numa_mem_id();
 
 	flags &= gfp_allowed_mask;
+	flags &= get_thread_gfp_mask();
 
 	lockdep_trace_alloc(flags);
 
@@ -3476,6 +3478,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 	void *objp;
 
 	flags &= gfp_allowed_mask;
+	flags &= get_thread_gfp_mask();
 
 	lockdep_trace_alloc(flags);
 
diff --git a/mm/slub.c b/mm/slub.c
index 8fd5401..36bcf05 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -27,6 +27,7 @@
 #include <linux/memory.h>
 #include <linux/math64.h>
 #include <linux/fault-inject.h>
+#include <linux/gfp.h>
 
 /*
  * Lock order:
@@ -789,6 +790,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 {
 	flags &= gfp_allowed_mask;
+	flags &= get_thread_gfp_mask();
 	lockdep_trace_alloc(flags);
 	might_sleep_if(flags & __GFP_WAIT);
 
@@ -798,6 +800,7 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
 {
 	flags &= gfp_allowed_mask;
+	flags &= get_thread_gfp_mask();
 	kmemcheck_slab_alloc(s, flags, object, s->objsize);
 	kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
 }
@@ -1691,6 +1694,7 @@ new_slab:
 	}
 
 	gfpflags &= gfp_allowed_mask;
+	gfpflags &= get_thread_gfp_mask();
 	if (gfpflags & __GFP_WAIT)
 		local_irq_enable();
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a3d66b3..bd56a34 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -26,6 +26,7 @@
 #include <linux/rcupdate.h>
 #include <linux/pfn.h>
 #include <linux/kmemleak.h>
+#include <linux/gfp.h>
 
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
 #include <asm/tlbflush.h>
@@ -1484,7 +1485,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 {
 	struct page **pages;
 	unsigned int nr_pages, array_size, i;
+	int ret;
 	gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
+	gfp_t thr_mask, saved_thr_mask;
 
 	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
 	array_size = (nr_pages * sizeof(struct page *));
@@ -1522,7 +1525,22 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		area->pages[i] = page;
 	}
 
-	if (map_vm_area(area, prot, &pages))
+	/*
+	 * map_vm_area() can allocate pages with GFP_KERNEL even though our
+	 * caller requested a more strict gfp_mask. Therefore we have to
+	 * prevent this by temporarily setting a more strict per-thread
+	 * gfp_mask.
+	 */
+	saved_thr_mask = get_thread_gfp_mask();
+	thr_mask = saved_thr_mask & gfp_mask;
+	set_thread_gfp_mask(thr_mask);
+
+	ret = map_vm_area(area, prot, &pages);
+
+	BUG_ON(get_thread_gfp_mask() != thr_mask);
+	set_thread_gfp_mask(saved_thr_mask);
+
+	if (ret != 0)
 		goto fail;
 
 	return area->addr;
-- 
1.7.1