Re: [PATCH RFC v3 5/9] mm/slub: add opt-in percpu array cache of objects

Suren Baghdasaryan <surenb@xxxxxxxxxx> · Fri, 15 Dec 2023 13:17:58 -0800

On Fri, Dec 15, 2023 at 10:28 AM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote:
>
> On Wed, Nov 29, 2023 at 1:53 AM Vlastimil Babka <vbabka@xxxxxxx> wrote:
> >
> > kmem_cache_setup_percpu_array() will allocate a per-cpu array for
> > caching alloc/free objects of given size for the cache. The cache
> > has to be created with SLAB_NO_MERGE flag.
> >
> > When empty, half of the array is filled by an internal bulk alloc
> > operation. When full, half of the array is flushed by an internal bulk
> > free operation.
> >
> > The array does not distinguish NUMA locality of the cached objects. If
> > an allocation is requested with kmem_cache_alloc_node() with numa node
> > not equal to NUMA_NO_NODE, the array is bypassed.
> >
> > The bulk operations exposed to slab users also try to utilize the array
> > when possible, but leave the array empty or full and use the bulk
> > alloc/free only to finish the operation itself. If kmemcg is enabled and
> > active, bulk freeing skips the array completely as it would be less
> > efficient to use it.
> >
> > The locking scheme is copied from the page allocator's pcplists, based
> > on embedded spin locks. Interrupts are not disabled, only preemption
> > (cpu migration on RT). Trylock is attempted to avoid deadlock due to an
> > interrupt; trylock failure means the array is bypassed.
> >
> > Sysfs stat counters alloc_cpu_cache and free_cpu_cache count objects
> > allocated or freed using the percpu array; counters cpu_cache_refill and
> > cpu_cache_flush count objects refilled or flushed form the array.
> >
> > kmem_cache_prefill_percpu_array() can be called to ensure the array on
> > the current cpu to at least the given number of objects. However this is
> > only opportunistic as there's no cpu pinning between the prefill and
> > usage, and trylocks may fail when the usage is in an irq handler.
> > Therefore allocations cannot rely on the array for success even after
> > the prefill. But misses should be rare enough that e.g. GFP_ATOMIC
> > allocations should be acceptable after the refill.
> >
> > When slub_debug is enabled for a cache with percpu array, the objects in
> > the array are considered as allocated from the slub_debug perspective,
> > and the alloc/free debugging hooks occur when moving the objects between
> > the array and slab pages. This means that e.g. an use-after-free that
> > occurs for an object cached in the array is undetected. Collected
> > alloc/free stacktraces might also be less useful. This limitation could
> > be changed in the future.
> >
> > On the other hand, KASAN, kmemcg and other hooks are executed on actual
> > allocations and frees by kmem_cache users even if those use the array,
> > so their debugging or accounting accuracy should be unaffected.
> >
> > Signed-off-by: Vlastimil Babka <vbabka@xxxxxxx>
> > ---
> >  include/linux/slab.h     |   4 +
> >  include/linux/slub_def.h |  12 ++
> >  mm/Kconfig               |   1 +
> >  mm/slub.c                | 457 ++++++++++++++++++++++++++++++++++++++++++++++-
> >  4 files changed, 468 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/slab.h b/include/linux/slab.h
> > index d6d6ffeeb9a2..fe0c0981be59 100644
> > --- a/include/linux/slab.h
> > +++ b/include/linux/slab.h
> > @@ -197,6 +197,8 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
> >  void kmem_cache_destroy(struct kmem_cache *s);
> >  int kmem_cache_shrink(struct kmem_cache *s);
> >
> > +int kmem_cache_setup_percpu_array(struct kmem_cache *s, unsigned int count);
> > +
> >  /*
> >   * Please use this macro to create slab caches. Simply specify the
> >   * name of the structure and maybe some flags that are listed above.
> > @@ -512,6 +514,8 @@ void kmem_cache_free(struct kmem_cache *s, void *objp);
> >  void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
> >  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
> >
> > +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count, gfp_t gfp);
> > +
> >  static __always_inline void kfree_bulk(size_t size, void **p)
> >  {
> >         kmem_cache_free_bulk(NULL, size, p);
> > diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
> > index deb90cf4bffb..2083aa849766 100644
> > --- a/include/linux/slub_def.h
> > +++ b/include/linux/slub_def.h
> > @@ -13,8 +13,10 @@
> >  #include <linux/local_lock.h>
> >
> >  enum stat_item {
> > +       ALLOC_PCA,              /* Allocation from percpu array cache */
> >         ALLOC_FASTPATH,         /* Allocation from cpu slab */
> >         ALLOC_SLOWPATH,         /* Allocation by getting a new cpu slab */
> > +       FREE_PCA,               /* Free to percpu array cache */
> >         FREE_FASTPATH,          /* Free to cpu slab */
> >         FREE_SLOWPATH,          /* Freeing not to cpu slab */
> >         FREE_FROZEN,            /* Freeing to frozen slab */
> > @@ -39,6 +41,8 @@ enum stat_item {
> >         CPU_PARTIAL_FREE,       /* Refill cpu partial on free */
> >         CPU_PARTIAL_NODE,       /* Refill cpu partial from node partial */
> >         CPU_PARTIAL_DRAIN,      /* Drain cpu partial to node partial */
> > +       PCA_REFILL,             /* Refilling empty percpu array cache */
> > +       PCA_FLUSH,              /* Flushing full percpu array cache */
> >         NR_SLUB_STAT_ITEMS
> >  };
> >
> > @@ -66,6 +70,13 @@ struct kmem_cache_cpu {
> >  };
> >  #endif /* CONFIG_SLUB_TINY */
> >
> > +struct slub_percpu_array {
> > +       spinlock_t lock;
> > +       unsigned int count;
> > +       unsigned int used;
> > +       void * objects[];
> > +};
> > +
> >  #ifdef CONFIG_SLUB_CPU_PARTIAL
> >  #define slub_percpu_partial(c)         ((c)->partial)
> >
> > @@ -99,6 +110,7 @@ struct kmem_cache {
> >  #ifndef CONFIG_SLUB_TINY
> >         struct kmem_cache_cpu __percpu *cpu_slab;
> >  #endif
> > +       struct slub_percpu_array __percpu *cpu_array;
> >         /* Used for retrieving partial slabs, etc. */
> >         slab_flags_t flags;
> >         unsigned long min_partial;
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index 89971a894b60..aa53c51bb4a6 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -237,6 +237,7 @@ choice
> >  config SLAB_DEPRECATED
> >         bool "SLAB (DEPRECATED)"
> >         depends on !PREEMPT_RT
> > +       depends on BROKEN
> >         help
> >           Deprecated and scheduled for removal in a few cycles. Replaced by
> >           SLUB.
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 59912a376c6d..f08bd71c244f 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -188,6 +188,79 @@ do {                                       \
> >  #define USE_LOCKLESS_FAST_PATH()       (false)
> >  #endif
> >
> > +/* copy/pasted  from mm/page_alloc.c */
> > +
> > +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
> > +/*
> > + * On SMP, spin_trylock is sufficient protection.
> > + * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
> > + */
> > +#define pcp_trylock_prepare(flags)     do { } while (0)
> > +#define pcp_trylock_finish(flag)       do { } while (0)
> > +#else
> > +
> > +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
> > +#define pcp_trylock_prepare(flags)     local_irq_save(flags)
> > +#define pcp_trylock_finish(flags)      local_irq_restore(flags)
> > +#endif
> > +
> > +/*
> > + * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
> > + * a migration causing the wrong PCP to be locked and remote memory being
> > + * potentially allocated, pin the task to the CPU for the lookup+lock.
> > + * preempt_disable is used on !RT because it is faster than migrate_disable.
> > + * migrate_disable is used on RT because otherwise RT spinlock usage is
> > + * interfered with and a high priority task cannot preempt the allocator.
> > + */
> > +#ifndef CONFIG_PREEMPT_RT
> > +#define pcpu_task_pin()                preempt_disable()
> > +#define pcpu_task_unpin()      preempt_enable()
> > +#else
> > +#define pcpu_task_pin()                migrate_disable()
> > +#define pcpu_task_unpin()      migrate_enable()
> > +#endif
> > +
> > +/*
> > + * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
> > + * Return value should be used with equivalent unlock helper.
> > + */
> > +#define pcpu_spin_lock(type, member, ptr)                              \
> > +({                                                                     \
> > +       type *_ret;                                                     \
> > +       pcpu_task_pin();                                                \
> > +       _ret = this_cpu_ptr(ptr);                                       \
> > +       spin_lock(&_ret->member);                                       \
> > +       _ret;                                                           \
> > +})
> > +
> > +#define pcpu_spin_trylock(type, member, ptr)                           \
> > +({                                                                     \
> > +       type *_ret;                                                     \
> > +       pcpu_task_pin();                                                \
> > +       _ret = this_cpu_ptr(ptr);                                       \
> > +       if (!spin_trylock(&_ret->member)) {                             \
> > +               pcpu_task_unpin();                                      \
> > +               _ret = NULL;                                            \
> > +       }                                                               \
> > +       _ret;                                                           \
> > +})
> > +
> > +#define pcpu_spin_unlock(member, ptr)                                  \
> > +({                                                                     \
> > +       spin_unlock(&ptr->member);                                      \
> > +       pcpu_task_unpin();                                              \
> > +})
> > +
> > +/* struct slub_percpu_array specific helpers. */
> > +#define pca_spin_lock(ptr)                                             \
> > +       pcpu_spin_lock(struct slub_percpu_array, lock, ptr)
> > +
> > +#define pca_spin_trylock(ptr)                                          \
> > +       pcpu_spin_trylock(struct slub_percpu_array, lock, ptr)
> > +
> > +#define pca_spin_unlock(ptr)                                           \
> > +       pcpu_spin_unlock(lock, ptr)
> > +
> >  #ifndef CONFIG_SLUB_TINY
> >  #define __fastpath_inline __always_inline
> >  #else
> > @@ -3454,6 +3527,78 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
> >                         0, sizeof(void *));
> >  }
> >
> > +static bool refill_pca(struct kmem_cache *s, unsigned int count, gfp_t gfp);
> > +
> > +static __fastpath_inline
> > +void *alloc_from_pca(struct kmem_cache *s, gfp_t gfp)
> > +{
> > +       unsigned long __maybe_unused UP_flags;
> > +       struct slub_percpu_array *pca;
> > +       void *object;
> > +
> > +retry:
> > +       pcp_trylock_prepare(UP_flags);
> > +       pca = pca_spin_trylock(s->cpu_array);
> > +
> > +       if (unlikely(!pca)) {
> > +               pcp_trylock_finish(UP_flags);
> > +               return NULL;
> > +       }
> > +
> > +       if (unlikely(pca->used == 0)) {
> > +               unsigned int batch = pca->count / 2;
> > +
> > +               pca_spin_unlock(pca);
> > +               pcp_trylock_finish(UP_flags);
> > +
> > +               if (!gfpflags_allow_blocking(gfp) || in_irq())
> > +                       return NULL;
> > +
> > +               if (refill_pca(s, batch, gfp))
> > +                       goto retry;
> > +
> > +               return NULL;
> > +       }
> > +
> > +       object = pca->objects[--pca->used];
> > +
> > +       pca_spin_unlock(pca);
> > +       pcp_trylock_finish(UP_flags);
> > +
> > +       stat(s, ALLOC_PCA);
> > +
> > +       return object;
> > +}
> > +
> > +static __fastpath_inline
> > +int alloc_from_pca_bulk(struct kmem_cache *s, size_t size, void **p)
> > +{
> > +       unsigned long __maybe_unused UP_flags;
> > +       struct slub_percpu_array *pca;
> > +
> > +       pcp_trylock_prepare(UP_flags);
> > +       pca = pca_spin_trylock(s->cpu_array);
> > +
> > +       if (unlikely(!pca)) {
> > +               size = 0;
> > +               goto failed;
> > +       }
> > +
> > +       if (pca->used < size)
> > +               size = pca->used;
> > +
> > +       for (int i = size; i > 0;) {
> > +               p[--i] = pca->objects[--pca->used];
> > +       }
> > +
> > +       pca_spin_unlock(pca);
> > +       stat_add(s, ALLOC_PCA, size);
> > +
> > +failed:
> > +       pcp_trylock_finish(UP_flags);
> > +       return size;
> > +}
> > +
> >  /*
> >   * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
> >   * have the fastpath folded into their functions. So no function call
> > @@ -3479,7 +3624,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
> >         if (unlikely(object))
> >                 goto out;
> >
> > -       object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
> > +       if (s->cpu_array && (node == NUMA_NO_NODE))
> > +               object = alloc_from_pca(s, gfpflags);
> > +
> > +       if (!object)
> > +               object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
> >
> >         maybe_wipe_obj_freeptr(s, object);
> >         init = slab_want_init_on_alloc(gfpflags, s);
> > @@ -3726,6 +3875,81 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> >         discard_slab(s, slab);
> >  }
> >
> > +static bool flush_pca(struct kmem_cache *s, unsigned int count);
> > +
> > +static __fastpath_inline
> > +bool free_to_pca(struct kmem_cache *s, void *object)
> > +{
> > +       unsigned long __maybe_unused UP_flags;
> > +       struct slub_percpu_array *pca;
> > +
> > +retry:
> > +       pcp_trylock_prepare(UP_flags);
> > +       pca = pca_spin_trylock(s->cpu_array);
> > +
> > +       if (!pca) {
> > +               pcp_trylock_finish(UP_flags);
> > +               return false;
> > +       }
> > +
> > +       if (pca->used == pca->count) {
> > +               unsigned int batch = pca->count / 2;
> > +
> > +               pca_spin_unlock(pca);
> > +               pcp_trylock_finish(UP_flags);
> > +
> > +               if (in_irq())
> > +                       return false;
> > +
> > +               if (!flush_pca(s, batch))
> > +                       return false;
> > +
> > +               goto retry;
> > +       }
> > +
> > +       pca->objects[pca->used++] = object;
> > +
> > +       pca_spin_unlock(pca);
> > +       pcp_trylock_finish(UP_flags);
> > +
> > +       stat(s, FREE_PCA);
> > +
> > +       return true;
> > +}
> > +
> > +static __fastpath_inline
> > +size_t free_to_pca_bulk(struct kmem_cache *s, size_t size, void **p)
> > +{
> > +       unsigned long __maybe_unused UP_flags;
> > +       struct slub_percpu_array *pca;
> > +       bool init;
> > +
> > +       pcp_trylock_prepare(UP_flags);
> > +       pca = pca_spin_trylock(s->cpu_array);
> > +
> > +       if (unlikely(!pca)) {
> > +               size = 0;
> > +               goto failed;
> > +       }
> > +
> > +       if (pca->count - pca->used < size)
> > +               size = pca->count - pca->used;
> > +
> > +       init = slab_want_init_on_free(s);
> > +
> > +       for (size_t i = 0; i < size; i++) {
> > +               if (likely(slab_free_hook(s, p[i], init)))
> > +                       pca->objects[pca->used++] = p[i];
> > +       }
> > +
> > +       pca_spin_unlock(pca);
> > +       stat_add(s, FREE_PCA, size);
> > +
> > +failed:
> > +       pcp_trylock_finish(UP_flags);
> > +       return size;
> > +}
> > +
> >  #ifndef CONFIG_SLUB_TINY
> >  /*
> >   * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
> > @@ -3811,7 +4035,12 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
> >  {
> >         memcg_slab_free_hook(s, slab, &object, 1);
> >
> > -       if (likely(slab_free_hook(s, object, slab_want_init_on_free(s))))
> > +       if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s))))
> > +               return;
> > +
> > +       if (s->cpu_array)
> > +               free_to_pca(s, object);
>
> free_to_pca() can return false and leave the object alive. I think you
> need to handle the failure case here to avoid leaks.
>
> > +       else
> >                 do_slab_free(s, slab, object, object, 1, addr);
> >  }
> >
> > @@ -3956,6 +4185,26 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
> >         if (!size)
> >                 return;
> >
> > +       /*
> > +        * In case the objects might need memcg_slab_free_hook(), skip the array
> > +        * because the hook is not effective with single objects and benefits
> > +        * from groups of objects from a single slab that the detached freelist
> > +        * builds. But once we build the detached freelist, it's wasteful to
> > +        * throw it away and put the objects into the array.
> > +        *
> > +        * XXX: This test could be cache-specific if it was not possible to use
> > +        * __GFP_ACCOUNT with caches that are not SLAB_ACCOUNT
> > +        */
> > +       if (s && s->cpu_array && !memcg_kmem_online()) {
> > +               size_t pca_freed = free_to_pca_bulk(s, size, p);
> > +
> > +               if (pca_freed == size)
> > +                       return;
> > +
> > +               p += pca_freed;
> > +               size -= pca_freed;
> > +       }
> > +
> >         do {
> >                 struct detached_freelist df;
> >
> > @@ -4073,7 +4322,8 @@ static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
> >  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
> >                           void **p)
> >  {
> > -       int i;
> > +       int from_pca = 0;
> > +       int allocated = 0;
> >         struct obj_cgroup *objcg = NULL;
> >
> >         if (!size)
> > @@ -4084,19 +4334,147 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
> >         if (unlikely(!s))
> >                 return 0;
> >
> > -       i = __kmem_cache_alloc_bulk(s, flags, size, p);
> > +       if (s->cpu_array)
> > +               from_pca = alloc_from_pca_bulk(s, size, p);
> > +
> > +       if (from_pca < size) {
> > +               allocated = __kmem_cache_alloc_bulk(s, flags, size-from_pca,
> > +                                                   p+from_pca);
> > +               if (allocated == 0 && from_pca > 0) {
> > +                       __kmem_cache_free_bulk(s, from_pca, p);
> > +               }
> > +       }
> > +
> > +       allocated += from_pca;
> >
> >         /*
> >          * memcg and kmem_cache debug support and memory initialization.
> >          * Done outside of the IRQ disabled fastpath loop.
> >          */
> > -       if (i != 0)
> > +       if (allocated != 0)
> >                 slab_post_alloc_hook(s, objcg, flags, size, p,
> >                         slab_want_init_on_alloc(flags, s), s->object_size);
> > -       return i;
> > +       return allocated;
> >  }
> >  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
> >
> > +static bool refill_pca(struct kmem_cache *s, unsigned int count, gfp_t gfp)
> > +{
> > +       void *objects[32];
> > +       unsigned int batch, allocated;
> > +       unsigned long __maybe_unused UP_flags;
> > +       struct slub_percpu_array *pca;
> > +
> > +bulk_alloc:
> > +       batch = min(count, 32U);
>
> Do you cap each batch at 32 to avoid overshooting too much (same in
> flush_pca())? If so, it would be good to have a comment here. Also,
> maybe this hardcoded 32 should be a function of pca->count instead? If
> we set up a pca array with pca->count larger than 64 then the refill
> count of pca->count/2 will always end up higher than 32, so at the end
> we will have to loop back (goto bulk_alloc) to allocate more objects.

Ah, I just noticed that you are using objects[32] and that's forcing
this limitation. Please ignore my previous comment.

>
> > +
> > +       allocated = __kmem_cache_alloc_bulk(s, gfp, batch, &objects[0]);
> > +       if (!allocated)
> > +               return false;
> > +
> > +       pcp_trylock_prepare(UP_flags);
> > +       pca = pca_spin_trylock(s->cpu_array);
> > +       if (!pca) {
> > +               pcp_trylock_finish(UP_flags);
> > +               return false;
> > +       }
> > +
> > +       batch = min(allocated, pca->count - pca->used);
> > +
> > +       for (unsigned int i = 0; i < batch; i++) {
> > +               pca->objects[pca->used++] = objects[i];
> > +       }
> > +
> > +       pca_spin_unlock(pca);
> > +       pcp_trylock_finish(UP_flags);
> > +
> > +       stat_add(s, PCA_REFILL, batch);
> > +
> > +       /*
> > +        * We could have migrated to a different cpu or somebody else freed to the
> > +        * pca while we were bulk allocating, and now we have too many objects
> > +        */
> > +       if (batch < allocated) {
> > +               __kmem_cache_free_bulk(s, allocated - batch, &objects[batch]);
> > +       } else {
> > +               count -= batch;
> > +               if (count > 0)
> > +                       goto bulk_alloc;
> > +       }
> > +
> > +       return true;
> > +}
> > +
> > +static bool flush_pca(struct kmem_cache *s, unsigned int count)
> > +{
> > +       void *objects[32];
> > +       unsigned int batch, remaining;
> > +       unsigned long __maybe_unused UP_flags;
> > +       struct slub_percpu_array *pca;
> > +
> > +next_batch:
> > +       batch = min(count, 32);
> > +
> > +       pcp_trylock_prepare(UP_flags);
> > +       pca = pca_spin_trylock(s->cpu_array);
> > +       if (!pca) {
> > +               pcp_trylock_finish(UP_flags);
> > +               return false;
> > +       }
> > +
> > +       batch = min(batch, pca->used);
> > +
> > +       for (unsigned int i = 0; i < batch; i++) {
> > +               objects[i] = pca->objects[--pca->used];
> > +       }
> > +
> > +       remaining = pca->used;
> > +
> > +       pca_spin_unlock(pca);
> > +       pcp_trylock_finish(UP_flags);
> > +
> > +       __kmem_cache_free_bulk(s, batch, &objects[0]);
> > +
> > +       stat_add(s, PCA_FLUSH, batch);
> > +
> > +       if (batch < count && remaining > 0) {
> > +               count -= batch;
> > +               goto next_batch;
> > +       }
> > +
> > +       return true;
> > +}
> > +
> > +/* Do not call from irq handler nor with irqs disabled */
> > +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count,
> > +                                   gfp_t gfp)
> > +{
> > +       struct slub_percpu_array *pca;
> > +       unsigned int used;
> > +
> > +       lockdep_assert_no_hardirq();
> > +
> > +       if (!s->cpu_array)
> > +               return -EINVAL;
> > +
> > +       /* racy but we don't care */
> > +       pca = raw_cpu_ptr(s->cpu_array);
> > +
> > +       used = READ_ONCE(pca->used);
> > +
> > +       if (used >= count)
> > +               return 0;
> > +
> > +       if (pca->count < count)
> > +               return -EINVAL;
> > +
> > +       count -= used;
> > +
> > +       if (!refill_pca(s, count, gfp))
> > +               return -ENOMEM;
> > +
> > +       return 0;
> > +}
> >
> >  /*
> >   * Object placement in a slab is made very easy because we always start at
> > @@ -5167,6 +5545,65 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
> >         return 0;
> >  }
> >
> > +/**
> > + * kmem_cache_setup_percpu_array - Create a per-cpu array cache for the cache
> > + * @s: The cache to add per-cpu array. Must be created with SLAB_NO_MERGE flag.
> > + * @count: Size of the per-cpu array.
> > + *
> > + * After this call, allocations from the cache go through a percpu array. When
> > + * it becomes empty, half is refilled with a bulk allocation. When it becomes
> > + * full, half is flushed with a bulk free operation.
> > + *
> > + * Using the array cache is not guaranteed, i.e. it can be bypassed if its lock
> > + * cannot be obtained. The array cache also does not distinguish NUMA nodes, so
> > + * allocations via kmem_cache_alloc_node() with a node specified other than
> > + * NUMA_NO_NODE will bypass the cache.
> > + *
> > + * Bulk allocation and free operations also try to use the array.
> > + *
> > + * kmem_cache_prefill_percpu_array() can be used to pre-fill the array cache
> > + * before e.g. entering a restricted context. It is however not guaranteed that
> > + * the caller will be able to subsequently consume the prefilled cache. Such
> > + * failures should be however sufficiently rare so after the prefill,
> > + * allocations using GFP_ATOMIC | __GFP_NOFAIL are acceptable for objects up to
> > + * the prefilled amount.
> > + *
> > + * Limitations: when slub_debug is enabled for the cache, all relevant actions
> > + * (i.e. poisoning, obtaining stacktraces) and checks happen when objects move
> > + * between the array cache and slab pages, which may result in e.g. not
> > + * detecting a use-after-free while the object is in the array cache, and the
> > + * stacktraces may be less useful.
> > + *
> > + * Return: 0 if OK, -EINVAL on caches without SLAB_NO_MERGE or with the array
> > + * already created, -ENOMEM when the per-cpu array creation fails.
> > + */
> > +int kmem_cache_setup_percpu_array(struct kmem_cache *s, unsigned int count)
> > +{
> > +       int cpu;
> > +
> > +       if (WARN_ON_ONCE(!(s->flags & SLAB_NO_MERGE)))
> > +               return -EINVAL;
> > +
> > +       if (s->cpu_array)
> > +               return -EINVAL;
> > +
> > +       s->cpu_array = __alloc_percpu(struct_size(s->cpu_array, objects, count),
> > +                                       sizeof(void *));
>
> Maybe I missed it, but where do you free s->cpu_array? I see
> __kmem_cache_release() freeing s->cpu_slab but s->cpu_array seems to
> be left alive...
>
> > +
> > +       if (!s->cpu_array)
> > +               return -ENOMEM;
> > +
> > +       for_each_possible_cpu(cpu) {
> > +               struct slub_percpu_array *pca = per_cpu_ptr(s->cpu_array, cpu);
> > +
> > +               spin_lock_init(&pca->lock);
> > +               pca->count = count;
> > +               pca->used = 0;
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> >  #ifdef SLAB_SUPPORTS_SYSFS
> >  static int count_inuse(struct slab *slab)
> >  {
> > @@ -5944,8 +6381,10 @@ static ssize_t text##_store(struct kmem_cache *s,                \
> >  }                                                              \
> >  SLAB_ATTR(text);                                               \
> >
> > +STAT_ATTR(ALLOC_PCA, alloc_cpu_cache);
> >  STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
> >  STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
> > +STAT_ATTR(FREE_PCA, free_cpu_cache);
> >  STAT_ATTR(FREE_FASTPATH, free_fastpath);
> >  STAT_ATTR(FREE_SLOWPATH, free_slowpath);
> >  STAT_ATTR(FREE_FROZEN, free_frozen);
> > @@ -5970,6 +6409,8 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
> >  STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
> >  STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
> >  STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
> > +STAT_ATTR(PCA_REFILL, cpu_cache_refill);
> > +STAT_ATTR(PCA_FLUSH, cpu_cache_flush);
> >  #endif /* CONFIG_SLUB_STATS */
> >
> >  #ifdef CONFIG_KFENCE
> > @@ -6031,8 +6472,10 @@ static struct attribute *slab_attrs[] = {
> >         &remote_node_defrag_ratio_attr.attr,
> >  #endif
> >  #ifdef CONFIG_SLUB_STATS
> > +       &alloc_cpu_cache_attr.attr,
> >         &alloc_fastpath_attr.attr,
> >         &alloc_slowpath_attr.attr,
> > +       &free_cpu_cache_attr.attr,
> >         &free_fastpath_attr.attr,
> >         &free_slowpath_attr.attr,
> >         &free_frozen_attr.attr,
> > @@ -6057,6 +6500,8 @@ static struct attribute *slab_attrs[] = {
> >         &cpu_partial_free_attr.attr,
> >         &cpu_partial_node_attr.attr,
> >         &cpu_partial_drain_attr.attr,
> > +       &cpu_cache_refill_attr.attr,
> > +       &cpu_cache_flush_attr.attr,
> >  #endif
> >  #ifdef CONFIG_FAILSLAB
> >         &failslab_attr.attr,
> >
> > --
> > 2.43.0
> >
> >