Hello, Steven.

On Thu, Jan 17, 2013 at 07:28:44PM -0500, Steven Rostedt wrote:
> In slab_alloc_node(), after the cpu_slab is assigned, if the task is
> preempted and moves to another CPU, there's nothing keeping the page and
> object in sync. The -rt kernel crashed because page was NULL and object
> was not, and the node_match() dereferences page. Even though the crash
> happened on -rt, there's nothing that's keeping this from happening on
> mainline.
>
> The easiest fix is to disable interrupts for the entire time from
> acquiring the current CPU cpu_slab and assigning the object and page.
> After that, it's fine to allow preemption.

How about this? It's based on v3.8-rc3. I haven't tested this patch yet;
I'm just sharing my idea for fixing the problem.

-----------------8<-----------------------
From aaf529e347b7deb8c35dd484abc9166a5d3c0629 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>
Date: Fri, 18 Jan 2013 13:10:33 +0900
Subject: [RFC PATCH] slub: change the approach to checking the node id in
 slab_alloc_node()

The description below borrows from Steven's
"[RFC] slub: Keep page and object in sync in slab_alloc_node()":

In slab_alloc_node(), after the cpu_slab is assigned, if the task is
preempted and moves to another CPU, there's nothing keeping the page and
object in sync. The -rt kernel crashed because page was NULL and object
was not, and node_match() dereferences page. Even though the crash
happened on -rt, there's nothing that's keeping this from happening on
mainline.

To fix this, add a new field, node, to struct kmem_cache_cpu. With it,
we can safely compare the current cpu slab's node with the node of the
allocation request in the allocation fast path, without having to worry
about either preemption or interrupts. The field is only updated in the
slow path, so this should not cause much performance degradation.
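To make the window concrete, below is a rough sketch of the current fast
path. The lines that do not appear in the hunks further down are paraphrased
from my reading of v3.8-rc3, so take the exact calls (__this_cpu_ptr(), the
tid/barrier() sequence) as illustration only, not as part of this patch:

/*
 * Illustration only: roughly the existing slab_alloc_node() fast path,
 * with the window where page and object can get out of sync marked.
 */
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
                gfp_t gfpflags, int node, unsigned long addr)
{
        void **object;
        struct kmem_cache_cpu *c;
        struct page *page;
        unsigned long tid;

        /* ... */
        c = __this_cpu_ptr(s->cpu_slab);
        tid = c->tid;
        barrier();

        object = c->freelist;
        /*
         * Nothing prevents preemption here: the task can migrate, or
         * another task can flush/refill this cpu_slab, so the page read
         * below is not necessarily the slab that object came from.  On
         * -rt, page was NULL while object was not.
         */
        page = c->page;
        if (unlikely(!object || !node_match(page, node)))
                /*
                 * When a specific node is requested, node_match() calls
                 * page_to_nid(page) and oopses on a NULL page.  A stale
                 * tid would be caught by the later this_cpu_cmpxchg_double(),
                 * but the crash happens before we ever get there.
                 */
                object = __slab_alloc(s, gfpflags, node, addr, c);
        else {
                /* ... this_cpu_cmpxchg_double() retries on a stale tid ... */
        }
        /* ... */
        return object;
}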
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 9db4825..b54dffa 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -46,6 +46,9 @@ enum stat_item {
 struct kmem_cache_cpu {
         void **freelist;        /* Pointer to next available object */
         unsigned long tid;      /* Globally unique transaction id */
+#ifdef CONFIG_NUMA
+        int node;
+#endif
         struct page *page;      /* The slab from which we are allocating */
         struct page *partial;   /* Partially allocated frozen slabs */
 #ifdef CONFIG_SLUB_STATS
diff --git a/mm/slub.c b/mm/slub.c
index ba2ca53..d4d3d07 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -272,6 +272,17 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
         *(void **)(object + s->offset) = fp;
 }
 
+static inline void set_cpuslab(struct kmem_cache_cpu *c, struct page *page)
+{
+        c->page = page;
+#ifdef CONFIG_NUMA
+        if (page)
+                c->node = page_to_nid(page);
+        else
+                c->node = NUMA_NO_NODE;
+#endif
+}
+
 /* Loop over all objects in a slab */
 #define for_each_object(__p, __s, __addr, __objects) \
         for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
@@ -1562,7 +1573,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
                         break;
 
                 if (!object) {
-                        c->page = page;
+                        set_cpuslab(c, page);
                         stat(s, ALLOC_FROM_PARTIAL);
                         object = t;
                         available = page->objects - page->inuse;
@@ -1993,7 +2004,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
         deactivate_slab(s, c->page, c->freelist);
 
         c->tid = next_tid(c->tid);
-        c->page = NULL;
+        set_cpuslab(c, NULL);
         c->freelist = NULL;
 }
 
@@ -2038,10 +2049,10 @@ static void flush_all(struct kmem_cache *s)
  * Check if the objects in a per cpu structure fit numa
  * locality expectations.
  */
-static inline int node_match(struct page *page, int node)
+static inline int node_match(struct kmem_cache_cpu *c, int node)
 {
 #ifdef CONFIG_NUMA
-        if (node != NUMA_NO_NODE && page_to_nid(page) != node)
+        if (node != NUMA_NO_NODE && c->node != node)
                 return 0;
 #endif
         return 1;
@@ -2136,7 +2147,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
                 page->freelist = NULL;
 
                 stat(s, ALLOC_SLAB);
-                c->page = page;
+                set_cpuslab(c, page);
                 *pc = c;
         } else
                 freelist = NULL;
@@ -2224,10 +2235,10 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                 goto new_slab;
 redo:
 
-        if (unlikely(!node_match(page, node))) {
+        if (unlikely(!node_match(c, node))) {
                 stat(s, ALLOC_NODE_MISMATCH);
                 deactivate_slab(s, page, c->freelist);
-                c->page = NULL;
+                set_cpuslab(c, NULL);
                 c->freelist = NULL;
                 goto new_slab;
         }
@@ -2239,7 +2250,7 @@ redo:
          */
         if (unlikely(!pfmemalloc_match(page, gfpflags))) {
                 deactivate_slab(s, page, c->freelist);
-                c->page = NULL;
+                set_cpuslab(c, NULL);
                 c->freelist = NULL;
                 goto new_slab;
         }
@@ -2254,7 +2265,7 @@ redo:
         freelist = get_freelist(s, page);
 
         if (!freelist) {
-                c->page = NULL;
+                set_cpuslab(c, NULL);
                 stat(s, DEACTIVATE_BYPASS);
                 goto new_slab;
         }
@@ -2276,8 +2287,8 @@ load_freelist:
 new_slab:
 
         if (c->partial) {
-                page = c->page = c->partial;
-                c->partial = page->next;
+                set_cpuslab(c, c->partial);
+                c->partial = c->partial->next;
                 stat(s, CPU_PARTIAL_ALLOC);
                 c->freelist = NULL;
                 goto redo;
@@ -2302,7 +2313,7 @@ new_slab:
                 goto new_slab;  /* Slab failed checks. Next slab needed */
 
         deactivate_slab(s, page, get_freepointer(s, freelist));
-        c->page = NULL;
+        set_cpuslab(c, NULL);
         c->freelist = NULL;
         local_irq_restore(flags);
         return freelist;
@@ -2323,7 +2334,6 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 {
         void **object;
         struct kmem_cache_cpu *c;
-        struct page *page;
         unsigned long tid;
 
         if (slab_pre_alloc_hook(s, gfpflags))
@@ -2350,8 +2360,7 @@ redo:
         barrier();
 
         object = c->freelist;
-        page = c->page;
-        if (unlikely(!object || !node_match(page, node)))
+        if (unlikely(!object || !node_match(c, node)))
                 object = __slab_alloc(s, gfpflags, node, addr, c);
 
         else {
-- 
1.7.9.5