On 3/2/21 5:14 PM, Christoph Lameter wrote:
> On Mon, 10 Aug 2020, Xunlei Pang wrote:
>
>>
>> diff --git a/mm/slab.h b/mm/slab.h
>> index c85e2fa..a709a70 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -616,7 +616,7 @@ struct kmem_cache_node {
>>  #ifdef CONFIG_SLUB
>>  	unsigned long nr_partial;
>>  	struct list_head partial;
>> -	atomic_long_t partial_free_objs;
>> +	atomic_long_t __percpu *partial_free_objs;
>
> A percpu counter is never atomic. Just use unsigned long and use this_cpu
> operations for this thing. That should cut down further on the overhead.
>
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -1775,11 +1775,21 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
>>  /*
>>   * Management of partially allocated slabs.
>>   */
>> +static inline long get_partial_free(struct kmem_cache_node *n)
>> +{
>> +	long nr = 0;
>> +	int cpu;
>> +
>> +	for_each_possible_cpu(cpu)
>> +		nr += atomic_long_read(per_cpu_ptr(n->partial_free_objs, cpu));
>
> this_cpu_read(*n->partial_free_objs)
>
>>  static inline void
>>  __update_partial_free(struct kmem_cache_node *n, long delta)
>>  {
>> -	atomic_long_add(delta, &n->partial_free_objs);
>> +	atomic_long_add(delta, this_cpu_ptr(n->partial_free_objs));
>
> this_cpu_add()
>
> and so on.
>

Thanks, I changed them both to use "unsigned long", and will send v3 out after our internal performance regression test passes.