On Wed, 18 Aug 2010, Christoph Lameter wrote:

> Index: linux-2.6/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6.orig/include/linux/slub_def.h	2010-08-17 11:38:48.000000000 -0500
> +++ linux-2.6/include/linux/slub_def.h	2010-08-17 14:45:54.000000000 -0500
> @@ -139,19 +139,16 @@ struct kmem_cache {
>
>  #ifdef CONFIG_ZONE_DMA
>  #define SLUB_DMA __GFP_DMA
> -/* Reserve extra caches for potential DMA use */
> -#define KMALLOC_CACHES (2 * SLUB_PAGE_SHIFT)
>  #else
>  /* Disable DMA functionality */
>  #define SLUB_DMA (__force gfp_t)0
> -#define KMALLOC_CACHES SLUB_PAGE_SHIFT
>  #endif
>
>  /*
>   * We keep the general caches in an array of slab caches that are used for
>   * 2^x bytes of allocations.
>   */
> -extern struct kmem_cache kmalloc_caches[KMALLOC_CACHES];
> +extern struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
>
>  /*
>   * Sorry that the following has to be that ugly but some versions of GCC
> @@ -216,7 +213,7 @@ static __always_inline struct kmem_cache
>  	if (index == 0)
>  		return NULL;
>
> -	return &kmalloc_caches[index];
> +	return kmalloc_caches[index];
>  }
>
>  void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
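The slub_def.h side looks good to me. For anyone following along, the
reason SLUB_PAGE_SHIFT entries are enough once the DMA caches are
allocated dynamically is the 2^x mapping in the comment above; roughly
this (userspace illustration only, not the kernel code, and it ignores
the 96/192 special cases and the size_index table):

	#include <assert.h>

	/* Toy model of the 2^x size-to-index mapping used by get_slab():
	 * index 3 is kmalloc-8, index 7 is kmalloc-128, and so on. */
	static int size_to_index(unsigned int size)
	{
		int index = 3;		/* assumes KMALLOC_MIN_SIZE == 8 */

		while ((1U << index) < size)
			index++;
		return index;
	}

	int main(void)
	{
		assert(size_to_index(8) == 3);
		assert(size_to_index(100) == 7);	/* rounds up to 128 */
		assert(size_to_index(4096) == 12);
		return 0;
	}

So one slot per power of two really is all the array needs; the DMA
variants no longer have to be reserved up front.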
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c	2010-08-17 12:30:11.000000000 -0500
> +++ linux-2.6/mm/slub.c	2010-08-17 14:46:09.000000000 -0500
> @@ -178,7 +178,7 @@ static struct notifier_block slab_notifi
>
>  static enum {
>  	DOWN,		/* No slab functionality available */
> -	PARTIAL,	/* kmem_cache_open() works but kmalloc does not */
> +	PARTIAL,	/* Kmem_cache_node works */

This isn't going to be needed anymore, even with the rest of your SLUB+Q
patches, so it should probably be removed unless you can think of a
future use.

>  	UP,		/* Everything works but does not show up in sysfs */
>  	SYSFS		/* Sysfs up */
>  } slab_state = DOWN;
> @@ -2073,6 +2073,8 @@ static inline int alloc_kmem_cache_cpus(
>  }
>
>  #ifdef CONFIG_NUMA
> +static struct kmem_cache *kmem_cache_node;
> +
>  /*
>   * No kmalloc_node yet so do it by hand. We know that this is the first
>   * slab on the node for this slabcache. There are no concurrent accesses
> @@ -2088,9 +2090,9 @@ static void early_kmem_cache_node_alloc(
>  	struct kmem_cache_node *n;
>  	unsigned long flags;
>
> -	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
> +	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
>
> -	page = new_slab(kmalloc_caches, GFP_NOWAIT, node);
> +	page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
>
>  	BUG_ON(!page);
>  	if (page_to_nid(page) != node) {
> @@ -2102,15 +2104,15 @@ static void early_kmem_cache_node_alloc(
>
>  	n = page->freelist;
>  	BUG_ON(!n);
> -	page->freelist = get_freepointer(kmalloc_caches, n);
> +	page->freelist = get_freepointer(kmem_cache_node, n);
>  	page->inuse++;
> -	kmalloc_caches->node[node] = n;
> +	kmem_cache_node->node[node] = n;
>  #ifdef CONFIG_SLUB_DEBUG
> -	init_object(kmalloc_caches, n, 1);
> -	init_tracking(kmalloc_caches, n);
> +	init_object(kmem_cache_node, n, 1);
> +	init_tracking(kmem_cache_node, n);
>  #endif
> -	init_kmem_cache_node(n, kmalloc_caches);
> -	inc_slabs_node(kmalloc_caches, node, page->objects);
> +	init_kmem_cache_node(n, kmem_cache_node);
> +	inc_slabs_node(kmem_cache_node, node, page->objects);
>
>  	/*
>  	 * lockdep requires consistent irq usage for each lock
> @@ -2128,8 +2130,10 @@ static void free_kmem_cache_nodes(struct
>
>  	for_each_node_state(node, N_NORMAL_MEMORY) {
>  		struct kmem_cache_node *n = s->node[node];
> +
>  		if (n)
> -			kmem_cache_free(kmalloc_caches, n);
> +			kmem_cache_free(kmem_cache_node, n);
> +
>  		s->node[node] = NULL;
>  	}
>  }
> @@ -2145,7 +2149,7 @@ static int init_kmem_cache_nodes(struct
>  			early_kmem_cache_node_alloc(node);
>  			continue;
>  		}
> -		n = kmem_cache_alloc_node(kmalloc_caches,
> +		n = kmem_cache_alloc_node(kmem_cache_node,
>  						GFP_KERNEL, node);
>
>  		if (!n) {
> @@ -2498,11 +2502,13 @@ EXPORT_SYMBOL(kmem_cache_destroy);
>   * Kmalloc subsystem
>   *******************************************************************/
>
> -struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
> +struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
>  EXPORT_SYMBOL(kmalloc_caches);
>
> +static struct kmem_cache *kmem_cache;
> +
>  #ifdef CONFIG_ZONE_DMA
> -static struct kmem_cache kmalloc_dma_caches[SLUB_PAGE_SHIFT];
> +static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
>  #endif
>
>  static int __init setup_slub_min_order(char *str)
> @@ -2541,9 +2547,13 @@ static int __init setup_slub_nomerge(cha
>
>  __setup("slub_nomerge", setup_slub_nomerge);
>
> -static void create_kmalloc_cache(struct kmem_cache *s,
> +static void __init create_kmalloc_cache(struct kmem_cache **sp,
>  		const char *name, int size, unsigned int flags)
>  {
> +	struct kmem_cache *s;
> +
> +	s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);

Needs BUG_ON(!s)?

> +
>  	/*
>  	 * This function is called with IRQs disabled during early-boot on
>  	 * single CPU so there's no need to take slub_lock here.
>  	 */
> @@ -2552,6 +2562,8 @@ static void create_kmalloc_cache(struct
>  					flags, NULL))
>  		goto panic;
>
> +	*sp = s;

Is there an advantage to doing this and not simply having the function
return s (or NULL, on error) back to kmem_cache_init()?
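Something like this (untested sketch, reusing your patch's helpers and
keeping the existing ARCH_KMALLOC_MINALIGN alignment) is what I have in
mind:

	static struct kmem_cache *__init create_kmalloc_cache(const char *name,
							int size, unsigned int flags)
	{
		struct kmem_cache *s;

		s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
		BUG_ON(!s);

		/* Early boot, IRQs off, single CPU: no need for slub_lock. */
		if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
								flags, NULL))
			goto panic;

		list_add(&s->list, &slab_caches);

		if (!sysfs_slab_add(s))
			return s;

	panic:
		panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
	}

The callers then become kmalloc_caches[i] = create_kmalloc_cache(...),
which keeps the assignment visible at the call site.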
> +
>  	list_add(&s->list, &slab_caches);
>
>  	if (!sysfs_slab_add(s))
> @@ -2613,10 +2625,10 @@ static struct kmem_cache *get_slab(size_
>
>  #ifdef CONFIG_ZONE_DMA
>  	if (unlikely((flags & SLUB_DMA)))
> -		return &kmalloc_dma_caches[index];
> +		return kmalloc_dma_caches[index];
>
>  #endif
> -	return &kmalloc_caches[index];
> +	return kmalloc_caches[index];
>  }
>
>  void *__kmalloc(size_t size, gfp_t flags)
> @@ -2940,46 +2952,114 @@ static int slab_memory_callback(struct n
>   * Basic setup of slabs
>   *******************************************************************/
>
> +/*
> + * Used for early kmem_cache structures that were allocated using
> + * the page allocator
> + */
> +
> +static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
> +{
> +	int node;
> +
> +	list_add(&s->list, &slab_caches);
> +	sysfs_slab_add(s);

We'll need some error handling here to at least emit a warning that the
cache never made it into sysfs.

> +	s->refcount = -1;
> +
> +	for_each_node(node) {

Only needs to iterate over N_NORMAL_MEMORY.

> +		struct kmem_cache_node *n = get_node(s, node);
> +		struct page *p;
> +
> +		if (n) {
> +			list_for_each_entry(p, &n->partial, lru)
> +				p->slab = s;
> +
> +#ifdef CONFIG_SLAB_DEBUG
> +			list_for_each_entry(p, &n->full, lru)
> +				p->slab = s;
> +#endif
> +		}
> +	}
> +}
> +

Also, s/CONFIG_SLAB_DEBUG/CONFIG_SLUB_DEBUG/ above; as written, the full
list is never walked and its pages would keep stale page->slab pointers.

>  void __init kmem_cache_init(void)
>  {
>  	int i;
>  	int caches = 0;
> +	struct kmem_cache *temp_kmem_cache;
> +	int order;
>
>  #ifdef CONFIG_NUMA
> +	struct kmem_cache *temp_kmem_cache_node;
> +	unsigned long kmalloc_size;
> +
> +	kmem_size = offsetof(struct kmem_cache, node) +
> +			nr_node_ids * sizeof(struct kmem_cache_node *);
> +
> +	/* Allocate two kmem_caches from the page allocator */
> +	kmalloc_size = ALIGN(kmem_size, cache_line_size());
> +	order = get_order(2 * kmalloc_size);
> +	kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
> +
>  	/*
>  	 * Must first have the slab cache available for the allocations of the
>  	 * struct kmem_cache_node's. There is special bootstrap code in
>  	 * kmem_cache_open for slab_state == DOWN.
>  	 */
> -	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
> -		sizeof(struct kmem_cache_node), 0);
> -	kmalloc_caches[0].refcount = -1;
> -	caches++;
> +	kmem_cache_node = (void *)kmem_cache + kmalloc_size;
> +
> +	kmem_cache_open(kmem_cache_node, "kmem_cache_node",
> +		sizeof(struct kmem_cache_node),
> +		0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
>
>  	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
> +#else
> +	/* Allocate a single kmem_cache from the page allocator */
> +	kmem_size = sizeof(struct kmem_cache);
> +	order = get_order(kmem_size);

Should this be cacheline aligned?

> +	kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
>  #endif
>
>  	/* Able to allocate the per node structures */
>  	slab_state = PARTIAL;
>
> -	/* Caches that are not of the two-to-the-power-of size */
> -	if (KMALLOC_MIN_SIZE <= 32) {
> -		create_kmalloc_cache(&kmalloc_caches[1],
> -			"kmalloc-96", 96, 0);
> -		caches++;
> -	}
> -	if (KMALLOC_MIN_SIZE <= 64) {
> -		create_kmalloc_cache(&kmalloc_caches[2],
> -			"kmalloc-192", 192, 0);
> -		caches++;
> -	}
> +	temp_kmem_cache = kmem_cache;
> +	kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
> +		0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
> +	kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);

BUG_ON(!kmem_cache);

> +	memcpy(kmem_cache, temp_kmem_cache, kmem_size);

kmem_cache_bootstrap_fixup(kmem_cache) should be here and not later,
right?
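To spell out why: between that memcpy() and the later fixup, every page
on the temporary kmem_cache's lists still carries a page->slab pointer
to the page-allocator copy. The pattern is just relocate-then-repoint; a
stripped-down userspace sketch with toy types (not the kernel
structures):

	#include <stdlib.h>
	#include <string.h>

	struct cache;

	struct page {
		struct cache *slab;	/* back-pointer, stale after relocation */
		struct page *next;
	};

	struct cache {
		struct page *partial;	/* pages carrying this cache's objects */
	};

	/* Move a bootstrap cache to its final home, then repair every
	 * back-pointer that still refers to the temporary copy. */
	static struct cache *relocate_cache(struct cache *tmp)
	{
		struct cache *final = malloc(sizeof(*final));
		struct page *p;

		memcpy(final, tmp, sizeof(*final));
		for (p = final->partial; p; p = p->next)
			p->slab = final;
		return final;
	}

	int main(void)
	{
		struct page p1 = { NULL, NULL };
		struct cache tmp = { &p1 };
		struct cache *final;

		p1.slab = &tmp;
		final = relocate_cache(&tmp);
		return p1.slab == final ? 0 : 1;	/* 0: pointer repaired */
	}

Anything that dereferences page->slab in the window before the fixup
runs would see the stale copy, hence the ordering question above.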
>
> -	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
> -		create_kmalloc_cache(&kmalloc_caches[i],
> -			"kmalloc", 1 << i, 0);
> -		caches++;
> -	}
> +#ifdef CONFIG_NUMA
> +	/*
> +	 * Allocate kmem_cache_node properly from the kmem_cache slab.
> +	 * kmem_cache_node is separately allocated so no need to
> +	 * update any list pointers.
> +	 */
> +	temp_kmem_cache_node = kmem_cache_node;
>
> +	kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);

BUG_ON(!kmem_cache_node);

> +	memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
> +
> +	kmem_cache_bootstrap_fixup(kmem_cache_node);
> +
> +	caches++;
> +#else
> +	/*
> +	 * kmem_cache has kmem_cache_node embedded and we moved it!
> +	 * Update the list heads
> +	 */
> +	INIT_LIST_HEAD(&kmem_cache->local_node.partial);
> +	list_splice(&temp_kmem_cache->local_node.partial, &kmem_cache->local_node.partial);
> +#ifdef CONFIG_SLUB_DEBUG
> +	INIT_LIST_HEAD(&kmem_cache->local_node.full);
> +	list_splice(&temp_kmem_cache->local_node.full, &kmem_cache->local_node.full);
> +#endif
> +#endif
> +	kmem_cache_bootstrap_fixup(kmem_cache);
> +	caches++;
> +	/* Free temporary boot structure */
> +	free_pages((unsigned long)temp_kmem_cache, order);
> +
> +	/* Now we can use the kmem_cache to allocate kmalloc slabs */
>
>  	/*
>  	 * Patch up the size_index table if we have strange large alignment
> @@ -3019,6 +3099,25 @@ void __init kmem_cache_init(void)
>  		size_index[size_index_elem(i)] = 8;
>  	}
>
> +	/* Caches that are not of the two-to-the-power-of size */
> +	if (KMALLOC_MIN_SIZE <= 32) {
> +		create_kmalloc_cache(&kmalloc_caches[1],
> +			"kmalloc-96", 96, 0);
> +		caches++;
> +	}
> +
> +	if (KMALLOC_MIN_SIZE <= 64) {
> +		create_kmalloc_cache(&kmalloc_caches[2],
> +			"kmalloc-192", 192, 0);
> +		caches++;
> +	}
> +
> +	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
> +		create_kmalloc_cache(&kmalloc_caches[i],
> +			"kmalloc", 1 << i, 0);
> +		caches++;
> +	}
> +
>  	slab_state = UP;
>
>  	/* Provide the correct kmalloc names now that the caches are up */
> @@ -3026,24 +3125,18 @@ void __init kmem_cache_init(void)
>  		char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
>
>  		BUG_ON(!s);
> -		kmalloc_caches[i].name = s;
> +		kmalloc_caches[i]->name = s;
>  	}
>
>  #ifdef CONFIG_SMP
>  	register_cpu_notifier(&slab_notifier);
>  #endif
> -#ifdef CONFIG_NUMA
> -	kmem_size = offsetof(struct kmem_cache, node) +
> -				nr_node_ids * sizeof(struct kmem_cache_node *);
> -#else
> -	kmem_size = sizeof(struct kmem_cache);
> -#endif
>
>  #ifdef CONFIG_ZONE_DMA
> -	for (i = 1; i < SLUB_PAGE_SHIFT; i++) {
> -		struct kmem_cache *s = &kmalloc_caches[i];
> +	for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
> +		struct kmem_cache *s = kmalloc_caches[i];
>
> -		if (s->size) {
> +		if (s && s->size) {
>  			char *name = kasprintf(GFP_NOWAIT,
>  				"dma-kmalloc-%d", s->objsize);
>
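One aside, since the odd-sized caches moved but stayed: the point of
kmalloc-96 and kmalloc-192 is to cut internal fragmentation between the
powers of two. A quick userspace illustration of the rounding (ignoring
the size_index details and KMALLOC_MIN_SIZE > 32 configurations):

	#include <stdio.h>

	/* Which general cache would serve an allocation of `size` bytes? */
	static unsigned int cache_size(unsigned int size)
	{
		unsigned int c = 8;

		if (size > 64 && size <= 96)
			return 96;
		if (size > 128 && size <= 192)
			return 192;
		while (c < size)		/* next power of two otherwise */
			c <<= 1;
		return c;
	}

	int main(void)
	{
		/* 70 bytes wastes 26 in kmalloc-96 instead of 58 in kmalloc-128 */
		printf("70  -> %u\n", cache_size(70));
		/* 150 bytes wastes 42 in kmalloc-192 instead of 106 in kmalloc-256 */
		printf("150 -> %u\n", cache_size(150));
		return 0;
	}

Nothing to change in the patch for that; it just confirms the two extra
caches still earn their slots in the array.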