On Mon, 10 Aug 2020, Xunlei Pang wrote:

>
> diff --git a/mm/slab.h b/mm/slab.h
> index c85e2fa..a709a70 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -616,7 +616,7 @@ struct kmem_cache_node {
>  #ifdef CONFIG_SLUB
>  	unsigned long nr_partial;
>  	struct list_head partial;
> -	atomic_long_t partial_free_objs;
> +	atomic_long_t __percpu *partial_free_objs;

A percpu counter is never atomic. Just use unsigned long and use this_cpu
operations for this thing. That should cut down further on the overhead.

> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1775,11 +1775,21 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
>  /*
>   * Management of partially allocated slabs.
>   */
> +static inline long get_partial_free(struct kmem_cache_node *n)
> +{
> +	long nr = 0;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu)
> +		nr += atomic_long_read(per_cpu_ptr(n->partial_free_objs, cpu));

this_cpu_read(*n->partial_free_objs)

>  static inline void
>  __update_partial_free(struct kmem_cache_node *n, long delta)
>  {
> -	atomic_long_add(delta, &n->partial_free_objs);
> +	atomic_long_add(delta, this_cpu_ptr(n->partial_free_objs));

this_cpu_add()

and so on.