A quick queue-based memory pool, that functions as a cache in-front of kmem_cache SLAB/SLUB allocators. Which allows faster than SLAB/SLUB reuse/caching of fixed size memory elements The speed gain comes from, the shared storage, using a Lock-Free queue that supports bulk refilling elements (to a percpu cache) with a single cmpxchg. Thus, the (lock-prefixed) cmpxchg cost is amortize over the bulk size. Qmempool cannot easily replace all kmem_cache usage, because it is restricted in which contexts is can be used in, as the Lock-Free queue is not preemption safe. E.g. only supports GFP_ATOMIC allocations from SLAB. This version is optimized for usage from softirq context, and cannot be used from hardirq context. Usage from none-softirq requires usage of local_bh_{disable,enable}, which have a fairly high cost. Performance micro benchmarks against SLUB. First test is fast-path reuse of same element. Second test is allocating 256 element before freeing elements again, this pattern comes from how NIC ring queue cleanups often run. On CPU E5-2695, CONFIG_PREEMPT=y, showing cost of alloc+free: SLUB - softirq - none-softirq fastpath-reuse: 19.563 ns - 7.837 ns - 18.536 ns N(256)-pattern: 45.039 ns - 11.782 ns - 24.186 ns A significant win for usage from softirq, and a smaller win for none-softirq which requires taking local_bh_{disable,enable}. Signed-off-by: Jesper Dangaard Brouer <brouer@xxxxxxxxxx> --- include/linux/qmempool.h | 205 +++++++++++++++++++++++++++++ mm/Kconfig | 12 ++ mm/Makefile | 1 mm/qmempool.c | 322 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 540 insertions(+), 0 deletions(-) create mode 100644 include/linux/qmempool.h create mode 100644 mm/qmempool.c diff --git a/include/linux/qmempool.h b/include/linux/qmempool.h new file mode 100644 index 0000000..922ed27 --- /dev/null +++ b/include/linux/qmempool.h @@ -0,0 +1,205 @@ +/* + * qmempool - a quick queue based mempool + * + * A quick queue-based memory pool, that functions as a cache in-front + * of kmem_cache SLAB/SLUB allocators. Which allows faster than + * SLAB/SLUB reuse/caching of fixed size memory elements + * + * The speed gain comes from, the shared storage, using a Lock-Free + * queue that supports bulk refilling elements (to a percpu cache) + * with a single cmpxchg. Thus, the lock-prefixed cmpxchg cost is + * amortize over the bulk size. + * + * The Lock-Free queue is based on an array (of pointer to elements). + * This make access more cache optimal, as e.g. on 64bit 8 pointers + * can be stored per cache-line (which is superior to a linked list + * approach). Only storing the pointers to elements, is also + * beneficial as we don't touch the elements data. + * + * Qmempool cannot easily replace all kmem_cache usage, because it is + * restricted in which contexts is can be used in, as the Lock-Free + * queue is not preemption safe. This version is optimized for usage + * from softirq context, and cannot be used from hardirq context. + * + * Only support GFP_ATOMIC allocations from SLAB. + * + * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer + * for licensing details see kernel-base/COPYING + */ + +#ifndef _LINUX_QMEMPOOL_H +#define _LINUX_QMEMPOOL_H + +#include <linux/alf_queue.h> +#include <linux/prefetch.h> +#include <linux/hardirq.h> + +/* Bulking is an essential part of the performance gains as this + * amortize the cost of cmpxchg ops used when accessing sharedq + */ +#define QMEMPOOL_BULK 16 +#define QMEMPOOL_REFILL_MULTIPLIER 2 + +struct qmempool_percpu { + struct alf_queue *localq; +}; + +struct qmempool { + /* The shared queue (sharedq) is a Multi-Producer-Multi-Consumer + * queue where access is protected by an atomic cmpxchg operation. + * The queue support bulk transfers, which amortize the cost + * of the atomic cmpxchg operation. + */ + struct alf_queue *sharedq; + + /* Per CPU local "cache" queues for faster atomic free access. + * The local queues (localq) are Single-Producer-Single-Consumer + * queues as they are per CPU. + */ + struct qmempool_percpu __percpu *percpu; + + /* Backed by some SLAB kmem_cache */ + struct kmem_cache *kmem; + + /* Setup */ + uint32_t prealloc; + gfp_t gfp_mask; +}; + +extern void qmempool_destroy(struct qmempool *pool); +extern struct qmempool *qmempool_create( + uint32_t localq_sz, uint32_t sharedq_sz, uint32_t prealloc, + struct kmem_cache *kmem, gfp_t gfp_mask); + +extern void *__qmempool_alloc_from_sharedq( + struct qmempool *pool, gfp_t gfp_mask, struct alf_queue *localq); +extern void __qmempool_free_to_sharedq(void *elem, struct qmempool *pool, + struct alf_queue *localq); + +/* The percpu variables (SPSC queues) needs preempt protection, and + * the shared MPMC queue also needs protection against the same CPU + * access the same queue. + * + * Specialize and optimize the qmempool to run from softirq. + * Don't allow qmempool to be used from interrupt context. + * + * IDEA: When used from softirq, take advantage of the protection + * softirq gives. A softirq will never preempt another softirq, + * running on the same CPU. The only event that can preempt a softirq + * is an interrupt handler (and perhaps we don't need to support + * calling qmempool from an interrupt). Another softirq, even the + * same one, can run on another CPU however, but these helpers are + * only protecting our percpu variables. + * + * Thus, our percpu variables are safe if current the CPU is the one + * serving the softirq (tested via in_serving_softirq()), like: + * + * if (!in_serving_softirq()) + * local_bh_disable(); + * + * This makes qmempool very fast, when accesses from softirq, but + * slower when accessed outside softirq. The other contexts need to + * disable bottom-halves "bh" via local_bh_{disable,enable} (which on + * have been measured add cost if 7.5ns on CPU E5-2695). + * + * MUST not be used from interrupt context, when relying on softirq usage. + */ +static inline int __qmempool_preempt_disable(void) +{ + int in_serving_softirq = in_serving_softirq(); + + if (!in_serving_softirq) + local_bh_disable(); + + return in_serving_softirq; +} + +static inline void __qmempool_preempt_enable(int in_serving_softirq) +{ + if (!in_serving_softirq) + local_bh_enable(); +} + +/* Elements - alloc and free functions are inlined here for + * performance reasons, as the per CPU lockless access should be as + * fast as possible. + */ + +/* Main allocation function + * + * Caller must make sure this is called from a preemptive safe context + */ +static inline void * main_qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask) +{ + /* NUMA considerations, for now the numa node is not handles, + * this could be handled via e.g. numa_mem_id() + */ + void *elem; + struct qmempool_percpu *cpu; + int num; + + /* 1. attempt get element from local per CPU queue */ + cpu = this_cpu_ptr(pool->percpu); + num = alf_sc_dequeue(cpu->localq, (void **)&elem, 1); + if (num == 1) /* Succes: alloc elem by deq from localq cpu cache */ + return elem; + + /* 2. attempt get element from shared queue. This involves + * refilling the localq for next round. Side-effect can be + * alloc from SLAB. + */ + elem = __qmempool_alloc_from_sharedq(pool, gfp_mask, cpu->localq); + return elem; +} + +static inline void *__qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask) +{ + void *elem; + int state; + + state = __qmempool_preempt_disable(); + elem = main_qmempool_alloc(pool, gfp_mask); + __qmempool_preempt_enable(state); + return elem; +} + +static inline void *__qmempool_alloc_softirq(struct qmempool *pool, + gfp_t gfp_mask) +{ + return main_qmempool_alloc(pool, gfp_mask); +} + +/* Main free function */ +static inline void __qmempool_free(struct qmempool *pool, void *elem) +{ + struct qmempool_percpu *cpu; + int num; + int state; + + /* NUMA considerations, how do we make sure to avoid caching + * elements from a different NUMA node. + */ + state = __qmempool_preempt_disable(); + + /* 1. attempt to free/return element to local per CPU queue */ + cpu = this_cpu_ptr(pool->percpu); + num = alf_sp_enqueue(cpu->localq, &elem, 1); + if (num == 1) /* success: element free'ed by enqueue to localq */ + goto done; + + /* 2. localq cannot store more elements, need to return some + * from localq to sharedq, to make room. Side-effect can be + * free to SLAB. + */ + __qmempool_free_to_sharedq(elem, pool, cpu->localq); + +done: + __qmempool_preempt_enable(state); +} + +/* API users can choose to use "__" prefixed versions for inlining */ +extern void *qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask); +extern void *qmempool_alloc_softirq(struct qmempool *pool, gfp_t gfp_mask); +extern void qmempool_free(struct qmempool *pool, void *elem); + +#endif /* _LINUX_QMEMPOOL_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 1d1ae6b..abaa94c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -618,3 +618,15 @@ config MAX_STACK_SIZE_MB changed to a smaller value in which case that is used. A sane initial value is 80 MB. + +config QMEMPOOL + bool "Quick queue based mempool (qmempool)" + default y + select ALF_QUEUE + help + A mempool designed for faster than SLAB/kmem_cache + reuse/caching of fixed size memory elements. Works as a + caching layer in-front of existing kmem_cache SLABs. Speed + is achieved by _bulk_ refilling percpu local cache, from a + Lock-Free queue requiring a single (locked) cmpxchg per bulk + transfer, thus amortizing the cost of the cmpxchg. diff --git a/mm/Makefile b/mm/Makefile index 8405eb0..49c1e18 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -69,3 +69,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o +obj-$(CONFIG_QMEMPOOL) += qmempool.o diff --git a/mm/qmempool.c b/mm/qmempool.c new file mode 100644 index 0000000..d6debcc --- /dev/null +++ b/mm/qmempool.c @@ -0,0 +1,322 @@ +/* + * qmempool - a quick queue based mempool + * + * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer + * for licensing details see kernel-base/COPYING + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/percpu.h> +#include <linux/qmempool.h> +#include <linux/log2.h> + +/* Due to hotplug CPU support, we need access to all qmempools + * in-order to cleanup elements in localq for the CPU going offline. + * + * TODO: implement HOTPLUG_CPU +#ifdef CONFIG_HOTPLUG_CPU +static LIST_HEAD(qmempool_list); +static DEFINE_SPINLOCK(qmempool_list_lock); +#endif + */ + +void qmempool_destroy(struct qmempool *pool) +{ + void *elem = NULL; + int j; + + if (pool->percpu) { + for_each_possible_cpu(j) { + struct qmempool_percpu *cpu = + per_cpu_ptr(pool->percpu, j); + + while (alf_mc_dequeue(cpu->localq, &elem, 1) == 1) + kmem_cache_free(pool->kmem, elem); + BUG_ON(!alf_queue_empty(cpu->localq)); + alf_queue_free(cpu->localq); + } + free_percpu(pool->percpu); + } + + if (pool->sharedq) { + while (alf_mc_dequeue(pool->sharedq, &elem, 1) == 1) + kmem_cache_free(pool->kmem, elem); + BUG_ON(!alf_queue_empty(pool->sharedq)); + alf_queue_free(pool->sharedq); + } + + kfree(pool); +} +EXPORT_SYMBOL(qmempool_destroy); + +struct qmempool * +qmempool_create(uint32_t localq_sz, uint32_t sharedq_sz, uint32_t prealloc, + struct kmem_cache *kmem, gfp_t gfp_mask) +{ + struct qmempool *pool; + int i, j, num; + void *elem; + + /* Validate constraints, e.g. due to bulking */ + if (localq_sz < QMEMPOOL_BULK) { + pr_err("%s() localq size(%d) too small for bulking\n", + __func__, localq_sz); + return NULL; + } + if (sharedq_sz < (QMEMPOOL_BULK * QMEMPOOL_REFILL_MULTIPLIER)) { + pr_err("%s() sharedq size(%d) too small for bulk refill\n", + __func__, sharedq_sz); + return NULL; + } + if (!is_power_of_2(localq_sz) || !is_power_of_2(sharedq_sz)) { + pr_err("%s() queue sizes (%d/%d) must be power-of-2\n", + __func__, localq_sz, sharedq_sz); + return NULL; + } + if (prealloc > sharedq_sz) { + pr_err("%s() prealloc(%d) req > sharedq size(%d)\n", + __func__, prealloc, sharedq_sz); + return NULL; + } + if ((prealloc % QMEMPOOL_BULK) != 0) { + pr_warn("%s() prealloc(%d) should be div by BULK size(%d)\n", + __func__, prealloc, QMEMPOOL_BULK); + } + if (!kmem) { + pr_err("%s() kmem_cache is a NULL ptr\n", __func__); + return NULL; + } + + pool = kzalloc(sizeof(*pool), gfp_mask); + if (!pool) + return NULL; + pool->kmem = kmem; + pool->gfp_mask = gfp_mask; + + /* MPMC (Multi-Producer-Multi-Consumer) queue */ + pool->sharedq = alf_queue_alloc(sharedq_sz, gfp_mask); + if (IS_ERR_OR_NULL(pool->sharedq)) { + pr_err("%s() failed to create shared queue(%d) ERR_PTR:0x%p\n", + __func__, sharedq_sz, pool->sharedq); + qmempool_destroy(pool); + return NULL; + } + + pool->prealloc = prealloc; + for (i = 0; i < prealloc; i++) { + elem = kmem_cache_alloc(pool->kmem, gfp_mask); + if (!elem) { + pr_err("%s() kmem_cache out of memory?!\n", __func__); + qmempool_destroy(pool); + return NULL; + } + /* Could use the SP version given it is not visible yet */ + num = alf_mp_enqueue(pool->sharedq, &elem, 1); + BUG_ON(num <= 0); + } + + pool->percpu = alloc_percpu(struct qmempool_percpu); + if (pool->percpu == NULL) { + pr_err("%s() failed to alloc percpu\n", __func__); + qmempool_destroy(pool); + return NULL; + } + + /* SPSC (Single-Consumer-Single-Producer) queue per CPU */ + for_each_possible_cpu(j) { + struct qmempool_percpu *cpu = per_cpu_ptr(pool->percpu, j); + + cpu->localq = alf_queue_alloc(localq_sz, gfp_mask); + if (IS_ERR_OR_NULL(cpu->localq)) { + pr_err("%s() failed alloc localq(sz:%d) on cpu:%d\n", + __func__, localq_sz, j); + qmempool_destroy(pool); + return NULL; + } + } + + return pool; +} +EXPORT_SYMBOL(qmempool_create); + +/* Element handling + */ + +/* This function is called when sharedq runs-out of elements. + * Thus, sharedq needs to be refilled (enq) with elems from slab. + * + * Caller must assure this is called in an preemptive safe context due + * to alf_mp_enqueue() call. + */ +void *__qmempool_alloc_from_slab(struct qmempool *pool, gfp_t gfp_mask) +{ + void *elems[QMEMPOOL_BULK]; /* on stack variable */ + void *elem; + int num, i, j; + + /* Cannot use SLAB that can sleep if (gfp_mask & __GFP_WAIT), + * else preemption disable/enable scheme becomes too complicated + */ + BUG_ON(gfp_mask & __GFP_WAIT); + + elem = kmem_cache_alloc(pool->kmem, gfp_mask); + if (elem == NULL) /* slab depleted, no reason to call below allocs */ + return NULL; + + /* SLAB considerations, we need a kmem_cache interface that + * supports allocating a bulk of elements. + */ + + for (i = 0; i < QMEMPOOL_REFILL_MULTIPLIER; i++) { + for (j = 0; j < QMEMPOOL_BULK; j++) { + elems[j] = kmem_cache_alloc(pool->kmem, gfp_mask); + /* Handle if slab gives us NULL elem */ + if (elems[j] == NULL) { + pr_err("%s() ARGH - slab returned NULL", + __func__); + num = alf_mp_enqueue(pool->sharedq, elems, j-1); + BUG_ON(num == 0); //FIXME handle + return elem; + } + } + num = alf_mp_enqueue(pool->sharedq, elems, QMEMPOOL_BULK); + /* FIXME: There is a theoretical chance that multiple + * CPU enter here, refilling sharedq at the same time, + * thus we must handle "full" situation, for now die + * hard so someone will need to fix this. + */ + BUG_ON(num == 0); /* sharedq should have room */ + } + + /* What about refilling localq here? (else it will happen on + * next cycle, and will cost an extra cmpxchg). + */ + return elem; +} + +/* This function is called when the localq runs out-of elements. + * Thus, localq is refilled (enq) with elements (deq) from sharedq. + * + * Caller must assure this is called in an preemptive safe context due + * to alf_mp_dequeue() call. + */ +void *__qmempool_alloc_from_sharedq(struct qmempool *pool, gfp_t gfp_mask, + struct alf_queue *localq) +{ + void *elems[QMEMPOOL_BULK]; /* on stack variable */ + void *elem; + int num; + + /* Costs atomic "cmpxchg", but amortize cost by bulk dequeue */ + num = alf_mc_dequeue(pool->sharedq, elems, QMEMPOOL_BULK); + if (likely(num > 0)) { + /* Consider prefetching data part of elements here, it + * should be an optimal place to hide memory prefetching. + * Especially given the localq is known to be an empty FIFO + * which guarantees the order objs are accessed in. + */ + elem = elems[0]; /* extract one element */ + if (num > 1) { + num = alf_sp_enqueue(localq, &elems[1], num-1); + /* Refill localq, should be empty, must succeed */ + BUG_ON(num == 0); + } + return elem; + } + /* Use slab if sharedq runs out of elements */ + elem = __qmempool_alloc_from_slab(pool, gfp_mask); + return elem; +} +EXPORT_SYMBOL(__qmempool_alloc_from_sharedq); + +/* Called when sharedq is full. Thus also make room in sharedq, + * besides also freeing the "elems" given. + */ +bool __qmempool_free_to_slab(struct qmempool *pool, void **elems, int n) +{ + int num, i, j; + /* SLAB considerations, we could use kmem_cache interface that + * supports returning a bulk of elements. + */ + + /* free these elements for real */ + for (i = 0; i < n; i++) + kmem_cache_free(pool->kmem, elems[i]); + + /* Make room in sharedq for next round */ + for (i = 0; i < QMEMPOOL_REFILL_MULTIPLIER; i++) { + num = alf_mc_dequeue(pool->sharedq, elems, QMEMPOOL_BULK); + for (j = 0; j < num; j++) + kmem_cache_free(pool->kmem, elems[j]); + } + return true; +} + +/* This function is called when the localq is full. Thus, elements + * from localq needs to be (dequeued) and returned (enqueued) to + * sharedq (or if shared is full, need to be free'ed to slab) + * + * MUST be called from a preemptive safe context. + */ +void __qmempool_free_to_sharedq(void *elem, struct qmempool *pool, + struct alf_queue *localq) +{ + void *elems[QMEMPOOL_BULK]; /* on stack variable */ + int num_enq, num_deq; + + elems[0] = elem; + /* Make room in localq */ + num_deq = alf_sc_dequeue(localq, &elems[1], QMEMPOOL_BULK-1); + if (unlikely(num_deq == 0)) + goto failed; + num_deq++; /* count first 'elem' */ + + /* Successful dequeued 'num_deq' elements from localq, "free" + * these elems by enqueuing to sharedq + */ + num_enq = alf_mp_enqueue(pool->sharedq, elems, num_deq); + if (likely(num_enq == num_deq)) /* Success enqueued to sharedq */ + return; + + /* If sharedq is full (num_enq == 0) dequeue elements will be + * returned directly to the SLAB allocator. + * + * Note: This usage of alf_queue API depend on enqueue is + * fixed, by only enqueueing if all elements could fit, this + * is an API that might change. + */ + + __qmempool_free_to_slab(pool, elems, num_deq); + return; +failed: + /* dequeing from a full localq should always be possible */ + BUG(); +} +EXPORT_SYMBOL(__qmempool_free_to_sharedq); + +/* API users can choose to use "__" prefixed versions for inlining */ +void *qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask) +{ + return __qmempool_alloc(pool, gfp_mask); +} +EXPORT_SYMBOL(qmempool_alloc); + +void *qmempool_alloc_softirq(struct qmempool *pool, gfp_t gfp_mask) +{ + return __qmempool_alloc_softirq(pool, gfp_mask); +} +EXPORT_SYMBOL(qmempool_alloc_softirq); + +void qmempool_free(struct qmempool *pool, void *elem) +{ + return __qmempool_free(pool, elem); +} +EXPORT_SYMBOL(qmempool_free); + +MODULE_DESCRIPTION("Quick queue based mempool (qmempool)"); +MODULE_AUTHOR("Jesper Dangaard Brouer <netoptimizer@xxxxxxxxxx>"); +MODULE_LICENSE("GPL"); -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>