On Fri, Sep 23, 2022 at 07:43:22PM +0800, Vlastimil Babka wrote:
> On 9/13/22 08:54, Feng Tang wrote:
[...]
> > which means in 'kmalloc-4k' slab, there are 126 requests of
> > 2240 bytes which got a 4KB space (wasting 1856 bytes each
> > and 233856 bytes in total), from ixgbe_alloc_q_vector().
> > 
> > And when the system starts some real workload like multiple docker
> > instances, there could be more severe waste.
> > 
> > [1]. https://lkml.org/lkml/2019/8/12/266
> > [2]. https://lore.kernel.org/lkml/2920df89-9975-5785-f79b-257d3052dfaf@xxxxxxxxxx/
> > 
> > [Thanks Hyeonggon for pointing out several bugs about sorting/format]
> > [Thanks Vlastimil for suggesting a way to reduce memory usage of
> >  orig_size and keep it only for kmalloc objects]
> > 
> > Signed-off-by: Feng Tang <feng.tang@xxxxxxxxx>
> > Reviewed-by: Hyeonggon Yoo <42.hyeyoo@xxxxxxxxx>
> > Cc: Robin Murphy <robin.murphy@xxxxxxx>
> > Cc: John Garry <john.garry@xxxxxxxxxx>
> > Cc: Kefeng Wang <wangkefeng.wang@xxxxxxxxxx>
> 
> Thanks.
> Given that the merge window is nearing, and the rest of the series a) has
> some changes suggested and b) could hopefully be done in a simpler way with
> the proposed ksize() cleanup, I am picking just this patch now to slab.git
> (and thus -next), with some small modifications:

OK, and all the cleanups/improvements from you look good to me. Many thanks!

For the kasan and ksize() related patches, I'll keep monitoring and
working on them.

- Feng

> ...
> 
> > +
> > +static unsigned int get_orig_size(struct kmem_cache *s, void *object)
> 
> Made this inline for consistency.
> 
> > +{
> > +	void *p = kasan_reset_tag(object);
> > +
> > +	if (!slub_debug_orig_size(s))
> > +		return s->object_size;
> > +
> > +	p += get_info_end(s);
> > +	p += sizeof(struct track) * 2;
> > +
> > +	return *(unsigned int *)p;
> > +}
> > +
> >  static void slab_bug(struct kmem_cache *s, char *fmt, ...)
> >  {
> >  	struct va_format vaf;
> > @@ -844,6 +890,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
> >  	if (s->flags & SLAB_STORE_USER)
> >  		off += 2 * sizeof(struct track);
> >  
> > +	if (slub_debug_orig_size(s))
> > +		off += sizeof(unsigned int);
> > +
> >  	off += kasan_metadata_size(s);
> >  
> >  	if (off != size_from_object(s))
> > @@ -977,7 +1026,8 @@ static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
> >   *
> >   * 	A. Free pointer (if we cannot overwrite object on free)
> >   * 	B. Tracking data for SLAB_STORE_USER
> > - * 	C. Padding to reach required alignment boundary or at minimum
> > + * 	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
> > + * 	D. Padding to reach required alignment boundary or at minimum
> >   * 		one word if debugging is on to be able to detect writes
> >   * 		before the word boundary.
> >   *
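Side note for anyone reading along: only the getter is quoted above. The
set_orig_size() counterpart that alloc_debug_processing() calls below is
not quoted here, but it has to mirror the same walk; roughly (my sketch
from the layout above, not the actual hunk):

static void set_orig_size(struct kmem_cache *s,
			  void *object, unsigned int orig_size)
{
	void *p = kasan_reset_tag(object);

	if (!slub_debug_orig_size(s))
		return;

	/* orig_size sits right after the two struct track records */
	p += get_info_end(s);
	p += sizeof(struct track) * 2;

	*(unsigned int *)p = orig_size;
}

So with SLAB_STORE_USER on a kmalloc cache, the metadata after the object
is laid out as: free pointer | track[alloc] | track[free] | orig_size
(unsigned int) | kasan metadata | padding, matching A-D above.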
> > @@ -995,10 +1045,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
> >  {
> >  	unsigned long off = get_info_end(s);	/* The end of info */
> >  
> > -	if (s->flags & SLAB_STORE_USER)
> > +	if (s->flags & SLAB_STORE_USER) {
> >  		/* We also have user information there */
> >  		off += 2 * sizeof(struct track);
> >  
> > +		if (s->flags & SLAB_KMALLOC)
> > +			off += sizeof(unsigned int);
> > +	}
> > +
> >  	off += kasan_metadata_size(s);
> >  
> >  	if (size_from_object(s) == off)
> > @@ -1293,7 +1347,7 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
> >  }
> >  
> >  static noinline int alloc_debug_processing(struct kmem_cache *s,
> > -			struct slab *slab, void *object)
> > +			struct slab *slab, void *object, int orig_size)
> >  {
> >  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> >  		if (!alloc_consistency_checks(s, slab, object))
> > @@ -1302,6 +1356,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,
> >  
> >  	/* Success. Perform special debug activities for allocs */
> >  	trace(s, slab, object, 1);
> > +	set_orig_size(s, object, orig_size);
> >  	init_object(s, object, SLUB_RED_ACTIVE);
> >  	return 1;
> >  
> > @@ -1570,7 +1625,10 @@ static inline
> >  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
> >  
> >  static inline int alloc_debug_processing(struct kmem_cache *s,
> > -	struct slab *slab, void *object) { return 0; }
> > +	struct slab *slab, void *object, int orig_size) { return 0; }
> > +
> > +static inline void set_orig_size(struct kmem_cache *s,
> > +	void *object, unsigned int orig_size) {}
> 
> There's no caller (in this patch alone) for the !SLUB_DEBUG version, so
> removed.
> 
> >  static inline void free_debug_processing(
> >  	struct kmem_cache *s, struct slab *slab,
> > @@ -1999,7 +2057,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
> >   * it to full list if it was the last free object.
> >   */
> >  static void *alloc_single_from_partial(struct kmem_cache *s,
> > -		struct kmem_cache_node *n, struct slab *slab)
> > +		struct kmem_cache_node *n, struct slab *slab, int orig_size)
> >  {
> >  	void *object;
> >  
> > @@ -2009,7 +2067,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
> >  	slab->freelist = get_freepointer(s, object);
> >  	slab->inuse++;
> >  
> > -	if (!alloc_debug_processing(s, slab, object)) {
> > +	if (!alloc_debug_processing(s, slab, object, orig_size)) {
> >  		remove_partial(n, slab);
> >  		return NULL;
> >  	}
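One thing worth spelling out for the hunks that follow: struct
partial_context is introduced by this patch, but its definition didn't
make it into the quoted context. From the usage it has to be roughly
(my reconstruction, not a quoted hunk):

struct partial_context {
	struct slab **slab;	/* out: slab the object was taken from */
	gfp_t flags;		/* allocation gfp flags */
	unsigned int orig_size;	/* original kmalloc request size */
};

which lets the slow path hand a single pointer down through
get_partial() and friends instead of growing every signature by one
more argument.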
> > @@ -2028,7 +2086,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
> >   * and put the slab to the partial (or full) list.
> >   */
> >  static void *alloc_single_from_new_slab(struct kmem_cache *s,
> > -					struct slab *slab)
> > +					struct slab *slab, int orig_size)
> >  {
> >  	int nid = slab_nid(slab);
> >  	struct kmem_cache_node *n = get_node(s, nid);
> > @@ -2040,7 +2098,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s,
> >  	slab->freelist = get_freepointer(s, object);
> >  	slab->inuse = 1;
> >  
> > -	if (!alloc_debug_processing(s, slab, object))
> > +	if (!alloc_debug_processing(s, slab, object, orig_size))
> >  		/*
> >  		 * It's not really expected that this would fail on a
> >  		 * freshly allocated slab, but a concurrent memory
> > @@ -2118,7 +2176,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
> >   * Try to allocate a partial slab from a specific node.
> >   */
> >  static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
> > -			      struct slab **ret_slab, gfp_t gfpflags)
> > +			      struct partial_context *pc)
> >  {
> >  	struct slab *slab, *slab2;
> >  	void *object = NULL;
> > @@ -2138,11 +2196,12 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
> >  	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
> >  		void *t;
> >  
> > -		if (!pfmemalloc_match(slab, gfpflags))
> > +		if (!pfmemalloc_match(slab, pc->flags))
> >  			continue;
> >  
> >  		if (kmem_cache_debug(s)) {
> > -			object = alloc_single_from_partial(s, n, slab);
> > +			object = alloc_single_from_partial(s, n, slab,
> > +							pc->orig_size);
> >  			if (object)
> >  				break;
> >  			continue;
> > @@ -2153,7 +2212,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
> >  			break;
> >  
> >  		if (!object) {
> > -			*ret_slab = slab;
> > +			*pc->slab = slab;
> >  			stat(s, ALLOC_FROM_PARTIAL);
> >  			object = t;
> >  		} else {
> > @@ -2177,14 +2236,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
> >  /*
> >   * Get a slab from somewhere. Search in increasing NUMA distances.
> >   */
> > -static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
> > -			     struct slab **ret_slab)
> > +static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
> >  {
> >  #ifdef CONFIG_NUMA
> >  	struct zonelist *zonelist;
> >  	struct zoneref *z;
> >  	struct zone *zone;
> > -	enum zone_type highest_zoneidx = gfp_zone(flags);
> > +	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
> >  	void *object;
> >  	unsigned int cpuset_mems_cookie;
> >  
> > @@ -2212,15 +2270,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
> >  
> >  	do {
> >  		cpuset_mems_cookie = read_mems_allowed_begin();
> > -		zonelist = node_zonelist(mempolicy_slab_node(), flags);
> > +		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
> >  		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
> >  			struct kmem_cache_node *n;
> >  
> >  			n = get_node(s, zone_to_nid(zone));
> >  
> > -			if (n && cpuset_zone_allowed(zone, flags) &&
> > +			if (n && cpuset_zone_allowed(zone, pc->flags) &&
> >  					n->nr_partial > s->min_partial) {
> > -				object = get_partial_node(s, n, ret_slab, flags);
> > +				object = get_partial_node(s, n, pc);
> >  				if (object) {
> >  					/*
> >  					 * Don't check read_mems_allowed_retry()
> > @@ -2241,8 +2299,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
> >  /*
> >   * Get a partial slab, lock it and return it.
> >   */
> > -static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
> > -			 struct slab **ret_slab)
> > +static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
> >  {
> >  	void *object;
> >  	int searchnode = node;
> > @@ -2250,11 +2307,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
> >  	if (node == NUMA_NO_NODE)
> >  		searchnode = numa_mem_id();
> >  
> > -	object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
> > +	object = get_partial_node(s, get_node(s, searchnode), pc);
> >  	if (object || node != NUMA_NO_NODE)
> >  		return object;
> >  
> > -	return get_any_partial(s, flags, ret_slab);
> > +	return get_any_partial(s, pc);
> >  }
> >  
> >  #ifdef CONFIG_PREEMPTION
> > @@ -2974,11 +3031,12 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
> >   * already disabled (which is the case for bulk allocation).
> >   */
> >  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > -			  unsigned long addr, struct kmem_cache_cpu *c)
> > +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
> >  {
> >  	void *freelist;
> >  	struct slab *slab;
> >  	unsigned long flags;
> > +	struct partial_context pc;
> >  
> >  	stat(s, ALLOC_SLOWPATH);
> >  
> > @@ -3092,7 +3150,10 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >  
> >  new_objects:
> >  
> > -	freelist = get_partial(s, gfpflags, node, &slab);
> > +	pc.flags = gfpflags;
> > +	pc.slab = &slab;
> > +	pc.orig_size = orig_size;
> > +	freelist = get_partial(s, node, &pc);
> >  	if (freelist)
> >  		goto check_new_slab;
> >  
> > @@ -3108,7 +3169,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >  	stat(s, ALLOC_SLAB);
> >  
> >  	if (kmem_cache_debug(s)) {
> > -		freelist = alloc_single_from_new_slab(s, slab);
> > +		freelist = alloc_single_from_new_slab(s, slab, orig_size);
> >  
> >  		if (unlikely(!freelist))
> >  			goto new_objects;
> > @@ -3140,6 +3201,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >  	 */
> >  	if (s->flags & SLAB_STORE_USER)
> >  		set_track(s, freelist, TRACK_ALLOC, addr);
> > +
> >  	return freelist;
> >  }
> >  
> > @@ -3182,7 +3244,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >   * pointer.
> >   */
> >  static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> > -			  unsigned long addr, struct kmem_cache_cpu *c)
> > +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
> >  {
> >  	void *p;
> >  
> > @@ -3195,7 +3257,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >  	c = slub_get_cpu_ptr(s->cpu_slab);
> >  #endif
> >  
> > -	p = ___slab_alloc(s, gfpflags, node, addr, c);
> > +	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
> >  #ifdef CONFIG_PREEMPT_COUNT
> >  	slub_put_cpu_ptr(s->cpu_slab);
> >  #endif
> > @@ -3280,7 +3342,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_l
> >  
> >  	if (!USE_LOCKLESS_FAST_PATH() ||
> >  	    unlikely(!object || !slab || !node_match(slab, node))) {
> > -		object = __slab_alloc(s, gfpflags, node, addr, c);
> > +		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
> >  	} else {
> >  		void *next_object = get_freepointer_safe(s, object);
> >  
> > @@ -3747,7 +3809,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
> >  			 * of re-populating per CPU c->freelist
> >  			 */
> >  			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
> > -					_RET_IP_, c);
> > +					_RET_IP_, c, s->object_size);
> >  			if (unlikely(!p[i]))
> >  				goto error;
> >  
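Putting the plumbing together, the request size now travels the whole
debug slow path (paraphrased from the hunks above; the kmalloc-entry
side is not quoted in this mail):

	kmalloc(2240, GFP_KERNEL)	/* served from kmalloc-4k */
	  slab_alloc_node(s, ..., orig_size = 2240)
	    __slab_alloc(..., orig_size)
	      ___slab_alloc(..., orig_size)
	        pc.orig_size = orig_size;
	        get_partial(s, node, &pc)
	          alloc_single_from_partial(s, n, slab, pc->orig_size)
	            alloc_debug_processing(s, slab, object, orig_size)
	              set_orig_size(s, object, 2240)

while kmem_cache_alloc_bulk() just passes s->object_size, i.e. records
zero waste.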
> > @@ -4150,12 +4212,17 @@ static int calculate_sizes(struct kmem_cache *s)
> >  	}
> >  
> >  #ifdef CONFIG_SLUB_DEBUG
> > -	if (flags & SLAB_STORE_USER)
> > +	if (flags & SLAB_STORE_USER) {
> >  		/*
> >  		 * Need to store information about allocs and frees after
> >  		 * the object.
> >  		 */
> >  		size += 2 * sizeof(struct track);
> > +
> > +		/* Save the original kmalloc request size */
> > +		if (flags & SLAB_KMALLOC)
> > +			size += sizeof(unsigned int);
> > +	}
> >  #endif
> >  
> >  	kasan_cache_create(s, &size, &s->flags);
> > @@ -4770,7 +4837,7 @@ void __init kmem_cache_init(void)
> >  
> >  	/* Now we can use the kmem_cache to allocate kmalloc slabs */
> >  	setup_kmalloc_cache_index_table();
> > -	create_kmalloc_caches(0);
> > +	create_kmalloc_caches(SLAB_KMALLOC);
> 
> Instead of this, add the flag in the common creation function, so SLAB
> kmalloc caches are also marked even if there's no use for it there now.
> 
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -649,7 +649,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
>  	if (!s)
>  		panic("Out of memory when creating slab %s\n", name);
>  
> -	create_boot_cache(s, name, size, flags, useroffset, usersize);
> +	create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
> +			  usersize);
>  	kasan_cache_create_kmalloc(s);
>  	list_add(&s->list, &slab_caches);
>  	s->refcount = 1;
> 
> >  	/* Setup random freelists for each cache */
> >  	init_freelist_randomization();
> > @@ -4937,6 +5004,7 @@ struct location {
> >  	depot_stack_handle_t handle;
> >  	unsigned long count;
> >  	unsigned long addr;
> > +	unsigned long waste;
> >  	long long sum_time;
> >  	long min_time;
> >  	long max_time;
> > @@ -4983,13 +5051,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
> >  }
> >  
> >  static int add_location(struct loc_track *t, struct kmem_cache *s,
> > -			const struct track *track)
> > +			const struct track *track,
> > +			unsigned int orig_size)
> >  {
> >  	long start, end, pos;
> >  	struct location *l;
> > -	unsigned long caddr, chandle;
> > +	unsigned long caddr, chandle, cwaste;
> >  	unsigned long age = jiffies - track->when;
> >  	depot_stack_handle_t handle = 0;
> > +	unsigned int waste = s->object_size - orig_size;
> >  
> >  #ifdef CONFIG_STACKDEPOT
> >  	handle = READ_ONCE(track->handle);
> > @@ -5007,11 +5077,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
> >  		if (pos == end)
> >  			break;
> >  
> > -		caddr = t->loc[pos].addr;
> > -		chandle = t->loc[pos].handle;
> > -		if ((track->addr == caddr) && (handle == chandle)) {
> > +		l = &t->loc[pos];
> > +		caddr = l->addr;
> > +		chandle = l->handle;
> > +		cwaste = l->waste;
> > +		if ((track->addr == caddr) && (handle == chandle) &&
> > +			(waste == cwaste)) {
> >  
> > -			l = &t->loc[pos];
> >  			l->count++;
> >  			if (track->when) {
> >  				l->sum_time += age;
> > @@ -5036,6 +5108,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
> >  			end = pos;
> >  		else if (track->addr == caddr && handle < chandle)
> >  			end = pos;
> > +		else if (track->addr == caddr && handle == chandle &&
> > +			 waste < cwaste)
> > +			end = pos;
> >  		else
> >  			start = pos;
> >  	}
> > @@ -5059,6 +5134,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
> >  	l->min_pid = track->pid;
> >  	l->max_pid = track->pid;
> >  	l->handle = handle;
> > +	l->waste = waste;
> >  	cpumask_clear(to_cpumask(l->cpus));
> >  	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
> >  	nodes_clear(l->nodes);
> > @@ -5077,7 +5153,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
> >  
> >  	for_each_object(p, s, addr, slab->objects)
> >  		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
> > -			add_location(t, s, get_track(s, p, alloc));
> > +			add_location(t, s, get_track(s, p, alloc), get_orig_size(s, p));
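To spell out the bucketing change: the sorted loc_track array is now
effectively keyed by (addr, handle, waste), so one call site can show up
as several rows when its requests waste different amounts. An equivalent
three-way compare, just to illustrate the ordering logic above (not code
from the patch):

static int loc_cmp(const struct location *a, const struct location *b)
{
	if (a->addr != b->addr)
		return a->addr < b->addr ? -1 : 1;
	if (a->handle != b->handle)
		return a->handle < b->handle ? -1 : 1;
	if (a->waste != b->waste)
		return a->waste < b->waste ? -1 : 1;
	return 0;	/* same site and waste: merge, l->count++ */
}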
> 
> I think it makes little sense to report waste in the 'free_traces' file?
> So adjusted like this to make sure nothing is reported there:
> 
> @@ -5356,13 +5353,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
>  					unsigned long *obj_map)
>  {
>  	void *addr = slab_address(slab);
> +	bool is_alloc = (alloc == TRACK_ALLOC);
>  	void *p;
>  
>  	__fill_map(obj_map, s, slab);
>  
>  	for_each_object(p, s, addr, slab->objects)
>  		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
> -			add_location(t, s, get_track(s, p, alloc), get_orig_size(s, p));
> +			add_location(t, s, get_track(s, p, alloc),
> +				     is_alloc ? get_orig_size(s, p) :
> +						s->object_size);
> 
> >  }
> >  #endif /* CONFIG_DEBUG_FS */
> >  #endif /* CONFIG_SLUB_DEBUG */
> > @@ -5942,6 +6018,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v)
> >  	else
> >  		seq_puts(seq, "<not-available>");
> >  
> > +	if (l->waste)
> > +		seq_printf(seq, " waste=%lu/%lu",
> > +			l->count * l->waste, l->waste);
> > +
> >  	if (l->sum_time != l->min_time) {
> >  		seq_printf(seq, " age=%ld/%llu/%ld",
> >  			l->min_time, div_u64(l->sum_time, l->count),
> > 
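For completeness, with this applied and slub_debug=U, the 'kmalloc-4k'
alloc_traces entry for the ixgbe case from the changelog should read
roughly like below (a line I composed from the numbers above and the
seq_printf format, not captured output; the symbol offsets are
placeholders):

    126 ixgbe_alloc_q_vector+0x.../0x... waste=233856/1856 age=... pid=... cpus=...

i.e. waste=<total bytes>/<bytes per object>, with 126 * 1856 = 233856.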