Hi,

On Fri, Aug 11, 2023 at 1:36 AM Vlastimil Babka <vbabka@xxxxxxx> wrote:
>
> kmem_cache_setup_percpu_array() will allocate a per-cpu array for
> caching alloc/free objects of given size for the cache. The cache
> has to be created with SLAB_NO_MERGE flag.
>
> The array is filled by freeing. When empty for alloc or full for
> freeing, it's simply bypassed by the operation, there's currently no
> batch freeing/allocations.
>
> The locking is copied from the page allocator's pcplists, based on
> embedded spin locks. Interrupts are not disabled, only preemption (cpu
> migration on RT). Trylock is attempted to avoid deadlock due to
> an intnerrupt, trylock failure means the array is bypassed.

nit: s/intnerrupt/interrupt/

> /*
>  * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
>  * have the fastpath folded into their functions. So no function call
> @@ -3465,7 +3564,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
>         if (unlikely(object))
>                 goto out;
>
> -       object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
> +       if (s->cpu_array)
> +               object = alloc_from_pca(s);
> +
> +       if (!object)
> +               object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
>
>         maybe_wipe_obj_freeptr(s, object);
>         init = slab_want_init_on_alloc(gfpflags, s);
> @@ -3715,6 +3818,34 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>         discard_slab(s, slab);
> }
> #ifndef CONFIG_SLUB_TINY
> /*
>  * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
> @@ -3740,6 +3871,11 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
>         unsigned long tid;
>         void **freelist;
>
> +       if (s->cpu_array && cnt == 1) {
> +               if (free_to_pca(s, head))
> +                       return;
> +       }
> +
> redo:
>         /*
>          * Determine the currently cpus per cpu slab.
> @@ -3793,6 +3929,11 @@ static void do_slab_free(struct kmem_cache *s,
> {
>         void *tail_obj = tail ? : head;
>
> +       if (s->cpu_array && cnt == 1) {
> +               if (free_to_pca(s, head))
> +                       return;
> +       }
> +
>         __slab_free(s, slab, head, tail_obj, cnt, addr);
> }
> #endif /* CONFIG_SLUB_TINY */

Is this functionality needed for SLUB_TINY?

> @@ -4060,6 +4201,45 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
> }
> EXPORT_SYMBOL(kmem_cache_alloc_bulk);
>
> +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count,
> +                                    gfp_t gfp)
> +{
> +       struct slub_percpu_array *pca;
> +       void *objects[32];
> +       unsigned int used;
> +       unsigned int allocated;
> +
> +       if (!s->cpu_array)
> +               return -EINVAL;
> +
> +       /* racy but we don't care */
> +       pca = raw_cpu_ptr(s->cpu_array);
> +
> +       used = READ_ONCE(pca->used);

Hmm, for the prefill to be meaningful, remote allocation should be
possible, right? Otherwise it only prefills for the CPU that requested
it.

> +       if (used >= count)
> +               return 0;
> +
> +       if (pca->count < count)
> +               return -EINVAL;
> +
> +       count -= used;
> +
> +       /* TODO fix later */
> +       if (count > 32)
> +               count = 32;
> +
> +       for (int i = 0; i < count; i++)
> +               objects[i] = NULL;
> +       allocated = kmem_cache_alloc_bulk(s, gfp, count, &objects[0]);
> +
> +       for (int i = 0; i < count; i++) {
> +               if (objects[i]) {
> +                       kmem_cache_free(s, objects[i]);
> +               }
> +       }

nit: why not

        for (int i = 0; i < allocated; i++) {
                kmem_cache_free(s, objects[i]);
        }

and skip the objects[i] = NULL initialization?

> +       return allocated;
> +}

And a question: Does SLUB still need to maintain per-cpu partial slab
lists even when an opt-in percpu array is used?
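
To make sure I understand the intended usage, here is a rough sketch of
how I imagine a caller would drive this (untested; only the prefill
signature is taken from the hunk above, while the
kmem_cache_setup_percpu_array() signature, object type and sizes are my
guesses from the changelog):

        /* example object, made up for the sketch */
        struct foo {
                unsigned long data[4];
        };

        static struct kmem_cache *foo_cache;

        static int __init foo_init(void)
        {
                /* percpu array requires an unmergeable cache */
                foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
                                              SLAB_NO_MERGE, NULL);
                if (!foo_cache)
                        return -ENOMEM;

                /* assumed signature: cache + size of the per-cpu array */
                return kmem_cache_setup_percpu_array(foo_cache, 32);
        }

        static int foo_prepare(gfp_t gfp)
        {
                /*
                 * Try to ensure this CPU's array holds at least 8 objects,
                 * so that a later allocation from a context that cannot
                 * reclaim is likely (but not guaranteed) to be served
                 * from the array.
                 */
                int prefilled = kmem_cache_prefill_percpu_array(foo_cache, 8, gfp);

                return prefilled < 0 ? prefilled : 0;
        }

If that matches the intent, it ties into my question above about the
prefill only filling the array of the requesting CPU.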