On Thu, 15 Feb 2024, Jianfeng Wang wrote:

> When reading "/proc/slabinfo", the kernel needs to report the number of
> free objects for each kmem_cache. The current implementation relies on
> count_partial() that counts the number of free objects by scanning each
> kmem_cache_node's partial slab list and summing free objects from all
> partial slabs in the list. This process must hold per kmem_cache_node
> spinlock and disable IRQ. Consequently, it can block slab allocation
> requests on other CPU cores and cause timeouts for network devices etc.,
> if the partial slab list is long. In production, even NMI watchdog can
> be triggered because some slab caches have a long partial list: e.g.,
> for "buffer_head", the number of partial slabs was observed to be ~1M
> in one kmem_cache_node. This problem was also observed by several
> others [1-2] in the past.
> 
> The fix is to maintain a counter of free objects for each kmem_cache.
> Then, in get_slabinfo(), use the counter rather than count_partial()
> when reporting the number of free objects for a slab cache. per-cpu
> counter is used to minimize atomic or lock operation.
> 
> Benchmark: run hackbench on a dual-socket 72-CPU bare metal machine
> with 256 GB memory and Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.3 GHz.
> The command is "hackbench 18 thread 20000". Each group gets 10 runs.
> 

This seems particularly intrusive for the common path to optimize for
reading of /proc/slabinfo, and that's shown in the benchmark result.

Could you discuss the /proc/slabinfo usage model a bit? It's not clear
if this is being continuously read, or whether even a single read in
isolation is problematic.

That said, optimizing for reading /proc/slabinfo at the cost of runtime
performance degradation doesn't sound like the right trade-off.

> Results:
> - Mainline:
>   21.0381 +- 0.0325 seconds time elapsed ( +- 0.15% )
> - Mainline w/ this patch:
>   21.1878 +- 0.0239 seconds time elapsed ( +- 0.11% )
> 
> [1] https://lore.kernel.org/linux-mm/alpine.DEB.2.21.2003031602460.1537@xxxxxxxxxxxxxxx/T/
> [2] https://lore.kernel.org/lkml/alpine.DEB.2.22.394.2008071258020.55871@xxxxxxxxxxxxxxx/T/
> 
> Signed-off-by: Jianfeng Wang <jianfeng.w.wang@xxxxxxxxxx>
> ---
>  mm/slab.h |  4 ++++
>  mm/slub.c | 31 +++++++++++++++++++++++++++++--
>  2 files changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/slab.h b/mm/slab.h
> index 54deeb0428c6..a0e7672ba648 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -11,6 +11,7 @@
>  #include <linux/memcontrol.h>
>  #include <linux/kfence.h>
>  #include <linux/kasan.h>
> +#include <linux/percpu_counter.h>
>  
>  /*
>   * Internal slab definitions
> @@ -277,6 +278,9 @@ struct kmem_cache {
>  	unsigned int red_left_pad;	/* Left redzone padding size */
>  	const char *name;		/* Name (only for display!) */
>  	struct list_head list;		/* List of slab caches */
> +#ifdef CONFIG_SLUB_DEBUG
> +	struct percpu_counter free_objects;
> +#endif
>  #ifdef CONFIG_SYSFS
>  	struct kobject kobj;		/* For sysfs */
>  #endif
> diff --git a/mm/slub.c b/mm/slub.c
> index 2ef88bbf56a3..44f8ded96574 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -736,6 +736,12 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
>  static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
>  static DEFINE_SPINLOCK(object_map_lock);
>  
> +static inline void
> +__update_kmem_cache_free_objs(struct kmem_cache *s, s64 delta)
> +{
> +	percpu_counter_add_batch(&s->free_objects, delta, INT_MAX);
> +}
> +
>  static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
>  		       struct slab *slab)
>  {
> @@ -1829,6 +1835,9 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
>  	return flags | slub_debug_local;
>  }
>  #else /* !CONFIG_SLUB_DEBUG */
> +static inline void
> +__update_kmem_cache_free_objs(struct kmem_cache *s, s64 delta) {}
> +
>  static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
>  static inline
>  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
> @@ -2369,6 +2378,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>  	slab->inuse = 0;
>  	slab->frozen = 0;
>  
> +	__update_kmem_cache_free_objs(s, slab->objects);
>  	account_slab(slab, oo_order(oo), s, flags);
>  
>  	slab->slab_cache = s;
> @@ -2445,6 +2455,7 @@ static void free_slab(struct kmem_cache *s, struct slab *slab)
>  		call_rcu(&slab->rcu_head, rcu_free_slab);
>  	else
>  		__free_slab(s, slab);
> +	__update_kmem_cache_free_objs(s, -slab->objects);
>  }
>  
>  static void discard_slab(struct kmem_cache *s, struct slab *slab)
> @@ -3859,6 +3870,8 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
>  	 */
>  	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);
>  
> +	if (object)
> +		__update_kmem_cache_free_objs(s, -1);
>  	return object;
>  }
>  
> @@ -4235,6 +4248,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
>  	unsigned long tid;
>  	void **freelist;
>  
> +	__update_kmem_cache_free_objs(s, cnt);
>  redo:
>  	/*
>  	 * Determine the currently cpus per cpu slab.
> @@ -4286,6 +4300,7 @@ static void do_slab_free(struct kmem_cache *s,
>  			struct slab *slab, void *head, void *tail,
>  			int cnt, unsigned long addr)
>  {
> +	__update_kmem_cache_free_objs(s, cnt);
>  	__slab_free(s, slab, head, tail, cnt, addr);
>  }
>  #endif /* CONFIG_SLUB_TINY */
> @@ -4658,6 +4673,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
>  		memcg_slab_alloc_error_hook(s, size, objcg);
>  	}
>  
> +	__update_kmem_cache_free_objs(s, -i);
>  	return i;
>  }
>  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
> @@ -4899,6 +4915,9 @@ void __kmem_cache_release(struct kmem_cache *s)
>  	cache_random_seq_destroy(s);
>  #ifndef CONFIG_SLUB_TINY
>  	free_percpu(s->cpu_slab);
> +#endif
> +#ifdef CONFIG_SLUB_DEBUG
> +	percpu_counter_destroy(&s->free_objects);
>  #endif
>  	free_kmem_cache_nodes(s);
>  }
> @@ -5109,6 +5128,14 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
>  	s->random = get_random_long();
>  #endif
>  
> +#ifdef CONFIG_SLUB_DEBUG
> +	int ret;
> +
> +	ret = percpu_counter_init(&s->free_objects, 0, GFP_KERNEL);
> +	if (ret)
> +		return ret;
> +#endif
> +
>  	if (!calculate_sizes(s))
>  		goto error;
>  	if (disable_higher_order_debug) {
> @@ -7100,15 +7127,15 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
>  {
>  	unsigned long nr_slabs = 0;
>  	unsigned long nr_objs = 0;
> -	unsigned long nr_free = 0;
> +	unsigned long nr_free;
>  	int node;
>  	struct kmem_cache_node *n;
>  
>  	for_each_kmem_cache_node(s, node, n) {
>  		nr_slabs += node_nr_slabs(n);
>  		nr_objs += node_nr_objs(n);
> -		nr_free += count_partial(n, count_free);
>  	}
> +	nr_free = percpu_counter_sum_positive(&s->free_objects);
>  
>  	sinfo->active_objs = nr_objs - nr_free;
>  	sinfo->num_objs = nr_objs;
> -- 
> 2.42.1
> 
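For anyone who wants to experiment with the trade-off outside the kernel, below
is a minimal user-space sketch of the scheme the patch builds on: each alloc/free
adjusts a per-"CPU" delta, the shared count is only touched when a delta crosses
a batch threshold, and the reader sums the shared count plus the residues instead
of walking partial lists. This is not the kernel's percpu_counter implementation;
all names and the BATCH value here are made up for illustration. The real
interfaces are percpu_counter_add_batch()/percpu_counter_sum_positive(), and with
the INT_MAX batch used in the patch the updates effectively always stay per-CPU,
which is why the read side must sum over all CPUs.

/*
 * Illustrative user-space model of a batched per-CPU counter.
 * Build: cc -O2 -pthread free_objs_demo.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTHREADS 4	/* stand-in for per-CPU contexts */
#define BATCH    32	/* fold into the shared count once a local delta exceeds this */

struct approx_counter {
	atomic_long shared;		/* global count, touched only on batch overflow */
	long local[NTHREADS];		/* per-"CPU" deltas, each written by one thread */
};

static struct approx_counter counter;

/* Fast path: usually a plain per-thread add, occasionally one atomic op. */
static void counter_add(int cpu, long delta)
{
	long v = counter.local[cpu] + delta;

	if (v > BATCH || v < -BATCH) {
		atomic_fetch_add(&counter.shared, v);
		v = 0;
	}
	counter.local[cpu] = v;
}

/*
 * Slow path (the /proc/slabinfo side): shared count plus residues.
 * In this demo it runs after the workers are joined; the kernel's
 * reader instead tolerates a small, bounded inaccuracy.
 */
static long counter_sum(void)
{
	long sum = atomic_load(&counter.shared);

	for (int cpu = 0; cpu < NTHREADS; cpu++)
		sum += counter.local[cpu];
	return sum;
}

static void *worker(void *arg)
{
	int cpu = (int)(long)arg;

	for (int i = 0; i < 100000; i++) {
		counter_add(cpu, 1);	/* object freed */
		counter_add(cpu, -1);	/* object allocated */
	}
	counter_add(cpu, 1);		/* leave one free object per "CPU" */
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];

	for (long cpu = 0; cpu < NTHREADS; cpu++)
		pthread_create(&tid[cpu], NULL, worker, (void *)cpu);
	for (int cpu = 0; cpu < NTHREADS; cpu++)
		pthread_join(tid[cpu], NULL);

	printf("approximate free objects: %ld\n", counter_sum());
	return 0;
}

The point of the sketch is that the cost moves from the reader (no list walk,
no list_lock) onto every alloc/free, which is exactly the fast-path overhead
the benchmark above is measuring.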