On Mon, Aug 26, 2024 at 06:04:13PM +0200, Christian Brauner wrote:
> When a kmem cache is created with SLAB_TYPESAFE_BY_RCU the free pointer
> must be located outside of the object because we don't know what part of
> the memory can safely be overwritten as it may be needed to prevent
> object recycling.
>
> That has the consequence that SLAB_TYPESAFE_BY_RCU may end up adding a
> new cacheline. This is the case for e.g. struct file. After having it
> shrunk down by 40 bytes and having it fit in three cachelines we still
> have SLAB_TYPESAFE_BY_RCU adding a fourth cacheline because it needs to
> accommodate the free pointer and is hardware cacheline aligned.
>
> I tried to find ways to rectify this as struct file is pretty much
> everywhere and having it use less memory is a good thing. So here's a
> proposal that might be totally the wrong api and broken but I thought I'd
> give it a try.
>
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
> ---
>  fs/file_table.c      |  7 ++--
>  include/linux/fs.h   |  1 +
>  include/linux/slab.h |  4 +++
>  mm/slab.h            |  1 +
>  mm/slab_common.c     | 76 +++++++++++++++++++++++++++++++++++++-------
>  mm/slub.c            | 22 +++++++++----
>  6 files changed, 91 insertions(+), 20 deletions(-)
>
> diff --git a/fs/file_table.c b/fs/file_table.c
> index 694199a1a966..a69b8a71eacb 100644
> --- a/fs/file_table.c
> +++ b/fs/file_table.c
> @@ -514,9 +514,10 @@ EXPORT_SYMBOL(__fput_sync);
>
>  void __init files_init(void)
>  {
> -	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
> -				SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
> -				SLAB_PANIC | SLAB_ACCOUNT, NULL);
> +	filp_cachep = kmem_cache_create_rcu("filp", sizeof(struct file),
> +				offsetof(struct file, __f_slab_free_ptr),
> +				SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
> +				NULL);
>  	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
>  }
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 61097a9cf317..de509f5d1446 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1057,6 +1057,7 @@ struct file {
>  		struct callback_head	f_task_work;
>  		struct llist_node	f_llist;
>  		struct file_ra_state	f_ra;
> +		void			*__f_slab_free_ptr;
>  	};
>  	/* --- cacheline 3 boundary (192 bytes) --- */
>  } __randomize_layout
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index eb2bf4629157..fc3c3cc9f689 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -242,6 +242,10 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
>  			slab_flags_t flags,
>  			unsigned int useroffset, unsigned int usersize,
>  			void (*ctor)(void *));
> +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
> +					 unsigned int offset,

Just 'offset' is too vague, no? Maybe freeptr_offset?

> +					 slab_flags_t flags,
> +					 void (*ctor)(void *));
>  void kmem_cache_destroy(struct kmem_cache *s);
>  int kmem_cache_shrink(struct kmem_cache *s);
>
> diff --git a/mm/slab.h b/mm/slab.h
> index dcdb56b8e7f5..122ca41fea34 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -261,6 +261,7 @@ struct kmem_cache {
>  	unsigned int object_size;	/* Object size without metadata */
>  	struct reciprocal_value reciprocal_size;
>  	unsigned int offset;		/* Free pointer offset */
> +	bool dedicated_offset;		/* Specific free pointer requested */

has_freeptr_offset?
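For my own understanding, the calling convention this introduces would
look something like the sketch below; 'struct foo' and its fields are
made up, only the kmem_cache_create_rcu() signature is from this patch:

	struct foo {
		refcount_t ref;		/* must stay valid for RCU readers */
		void *__free_ptr;	/* scribbled on only while the object is free */
	};

	foo_cachep = kmem_cache_create_rcu("foo", sizeof(struct foo),
					   offsetof(struct foo, __free_ptr),
					   SLAB_ACCOUNT, NULL);

i.e. the caller promises __free_ptr is never needed while RCU readers
may still poke at a recycled object, and in exchange the cache stops
growing the object by an extra word for the out-of-object free pointer.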
>  #ifdef CONFIG_SLUB_CPU_PARTIAL
>  	/* Number of per cpu partial objects to keep around */
>  	unsigned int cpu_partial;
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 40b582a014b8..b6ca63859b3a 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -202,10 +202,10 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
>  }
>
>  static struct kmem_cache *create_cache(const char *name,
> -				       unsigned int object_size, unsigned int align,
> -				       slab_flags_t flags, unsigned int useroffset,
> -				       unsigned int usersize, void (*ctor)(void *),
> -				       struct kmem_cache *root_cache)
> +				       unsigned int object_size, unsigned int offset,
> +				       unsigned int align, slab_flags_t flags,
> +				       unsigned int useroffset, unsigned int usersize,
> +				       void (*ctor)(void *), struct kmem_cache *root_cache)
>  {
>  	struct kmem_cache *s;
>  	int err;
> @@ -213,6 +213,10 @@ static struct kmem_cache *create_cache(const char *name,
>  	if (WARN_ON(useroffset + usersize > object_size))
>  		useroffset = usersize = 0;
>
> +	if (WARN_ON(offset >= object_size ||
> +		    (offset && !(flags & SLAB_TYPESAFE_BY_RCU))))
> +		offset = 0;
> +
>  	err = -ENOMEM;
>  	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
>  	if (!s)
> @@ -226,6 +230,10 @@ static struct kmem_cache *create_cache(const char *name,
>  	s->useroffset = useroffset;
>  	s->usersize = usersize;
>  #endif
> +	if (offset > 0) {
> +		s->offset = offset;
> +		s->dedicated_offset = true;
> +	}
>
>  	err = __kmem_cache_create(s, flags);
>  	if (err)
> @@ -269,10 +277,10 @@ static struct kmem_cache *create_cache(const char *name,
>   *
>   * Return: a pointer to the cache on success, NULL on failure.
>   */
> -struct kmem_cache *
> -kmem_cache_create_usercopy(const char *name,
> -			   unsigned int size, unsigned int align,
> -			   slab_flags_t flags,
> +static struct kmem_cache *
> +do_kmem_cache_create_usercopy(const char *name,
> +			      unsigned int size, unsigned int offset,
> +			      unsigned int align, slab_flags_t flags,
>  			   unsigned int useroffset, unsigned int usersize,
>  			   void (*ctor)(void *))
>  {
> @@ -332,7 +340,7 @@ kmem_cache_create_usercopy(const char *name,
>  		goto out_unlock;
>  	}
>
> -	s = create_cache(cache_name, size,
> +	s = create_cache(cache_name, size, offset,
>  			 calculate_alignment(flags, align, size),
>  			 flags, useroffset, usersize, ctor, NULL);
>  	if (IS_ERR(s)) {
> @@ -356,6 +364,16 @@ kmem_cache_create_usercopy(const char *name,
>  	}
>  	return s;
>  }
> +
> +struct kmem_cache *
> +kmem_cache_create_usercopy(const char *name, unsigned int size,
> +			   unsigned int align, slab_flags_t flags,
> +			   unsigned int useroffset, unsigned int usersize,
> +			   void (*ctor)(void *))
> +{
> +	return do_kmem_cache_create_usercopy(name, size, 0, align, flags,
> +					     useroffset, usersize, ctor);
> +}
>  EXPORT_SYMBOL(kmem_cache_create_usercopy);
>
>  /**
> @@ -387,11 +405,47 @@ struct kmem_cache *
>  kmem_cache_create(const char *name, unsigned int size, unsigned int align,
>  		  slab_flags_t flags, void (*ctor)(void *))
>  {
> -	return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
> -					  ctor);
> +	return do_kmem_cache_create_usercopy(name, size, 0, align, flags, 0, 0,
> +					     ctor);
>  }
>  EXPORT_SYMBOL(kmem_cache_create);
>
> +/**
> + * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache.
> + * @name: A string which is used in /proc/slabinfo to identify this cache.
> + * @size: The size of objects to be created in this cache.
> + * @offset: The offset into the memory to the free pointer
> + * @flags: SLAB flags
> + * @ctor: A constructor for the objects.
> + *
> + * Cannot be called within an interrupt, but can be interrupted.
> + * The @ctor is run when new pages are allocated by the cache.
> + *
> + * The flags are
> + *
> + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
> + * to catch references to uninitialised memory.
> + *
> + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
> + * for buffer overruns.
> + *
> + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
> + * cacheline. This can be beneficial if you're counting cycles as closely
> + * as davem.
> + *
> + * Return: a pointer to the cache on success, NULL on failure.
> + */
> +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
> +					 unsigned int offset,
> +					 slab_flags_t flags,
> +					 void (*ctor)(void *))
> +{
> +	return do_kmem_cache_create_usercopy(name, size, offset, 0,
> +					     flags | SLAB_TYPESAFE_BY_RCU, 0, 0,
> +					     ctor);
> +}
> +EXPORT_SYMBOL(kmem_cache_create_rcu);
> +
>  static struct kmem_cache *kmem_buckets_cache __ro_after_init;
>
>  /**
> diff --git a/mm/slub.c b/mm/slub.c
> index c9d8a2497fd6..34eac3f9a46e 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3926,7 +3926,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
>  						   void *obj)
>  {
>  	if (unlikely(slab_want_init_on_free(s)) && obj &&
> -	    !freeptr_outside_object(s))
> +	    !freeptr_outside_object(s) && !s->dedicated_offset)
>  		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
>  		       0, sizeof(void *));
>  }
> @@ -5153,6 +5153,7 @@ static int calculate_sizes(struct kmem_cache *s)
>  	slab_flags_t flags = s->flags;
>  	unsigned int size = s->object_size;
>  	unsigned int order;
> +	bool must_use_freeptr_offset;
>
>  	/*
>  	 * Round up object size to the next word boundary. We can only
> @@ -5189,9 +5190,12 @@ static int calculate_sizes(struct kmem_cache *s)
>  	 */
>  	s->inuse = size;
>
> -	if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor ||
> -	    ((flags & SLAB_RED_ZONE) &&
> -	     (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
> +	must_use_freeptr_offset =
> +		(flags & SLAB_POISON) || s->ctor ||
> +		((flags & SLAB_RED_ZONE) &&
> +		 (s->object_size < sizeof(void *) || slub_debug_orig_size(s)));
> +
> +	if ((flags & SLAB_TYPESAFE_BY_RCU) || must_use_freeptr_offset) {
>  		/*
>  		 * Relocate free pointer after the object if it is not
>  		 * permitted to overwrite the first word of the object on
> @@ -5208,8 +5212,13 @@ static int calculate_sizes(struct kmem_cache *s)
>  		 * freeptr_outside_object() function. If that is no
>  		 * longer true, the function needs to be modified.
>  		 */
> -		s->offset = size;
> -		size += sizeof(void *);
> +		if (!(flags & SLAB_TYPESAFE_BY_RCU) || must_use_freeptr_offset) {
> +			s->offset = size;
> +			size += sizeof(void *);
> +			s->dedicated_offset = false;
> +		} else {
> +			s->dedicated_offset = true;

Hmm, this seems to set s->dedicated_offset for any SLAB_TYPESAFE_BY_RCU
cache, even those that weren't created with kmem_cache_create_rcu().
Shouldn't we have

	must_use_freeptr_offset =
		((flags & SLAB_TYPESAFE_BY_RCU) && !s->dedicated_offset) ||
		(flags & SLAB_POISON) || s->ctor ||
		((flags & SLAB_RED_ZONE) &&
		 (s->object_size < sizeof(void *) || slub_debug_orig_size(s)));

	if (must_use_freeptr_offset) {
		...
	}
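To make the fallout concrete -- this is just how I read the code, not
something I've run, and 'struct bar' is made up: a pre-existing
SLAB_TYPESAFE_BY_RCU user that goes through plain kmem_cache_create()
takes the new else branch, gets dedicated_offset == true while s->offset
is still 0, and ends up with the freelist pointer overlaying the first
word of the object -- the exact overwrite SLAB_TYPESAFE_BY_RCU exists to
prevent:

	/* No freeptr_offset anywhere, so create_cache() leaves s->offset at 0. */
	cachep = kmem_cache_create("bar", sizeof(struct bar), 0,
				   SLAB_TYPESAFE_BY_RCU | SLAB_PANIC, NULL);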
> +		}
>  	} else {
>  		/*
>  		 * Store freelist pointer near middle of object to keep
> @@ -5301,6 +5310,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
>  	if (get_order(s->size) > get_order(s->object_size)) {
>  		s->flags &= ~DEBUG_METADATA_FLAGS;
>  		s->offset = 0;
> +		s->dedicated_offset = false;
>  		if (!calculate_sizes(s))
>  			goto error;
>  	}
> --
> 2.45.2
>

--
Sincerely yours,
Mike.