On Tue, Aug 27, 2024 at 01:42:30PM GMT, Mike Rapoport wrote:
> On Mon, Aug 26, 2024 at 06:04:13PM +0200, Christian Brauner wrote:
> > When a kmem cache is created with SLAB_TYPESAFE_BY_RCU the free pointer
> > must be located outside of the object because we don't know what part of
> > the memory can safely be overwritten as it may be needed to prevent
> > object recycling.
> >
> > That has the consequence that SLAB_TYPESAFE_BY_RCU may end up adding a
> > new cacheline. This is the case for e.g., struct file. After having it
> > shrunk down by 40 bytes and having it fit in three cachelines we still
> > have SLAB_TYPESAFE_BY_RCU adding a fourth cacheline because it needs to
> > accommodate the free pointer and is hardware cacheline aligned.
> >
> > I tried to find ways to rectify this as struct file is pretty much
> > everywhere and having it use less memory is a good thing. So here's a
> > proposal that might be totally the wrong api and broken but I thought
> > I'd give it a try.
> >
> > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
> > ---
> >  fs/file_table.c      |  7 ++--
> >  include/linux/fs.h   |  1 +
> >  include/linux/slab.h |  4 +++
> >  mm/slab.h            |  1 +
> >  mm/slab_common.c     | 76 +++++++++++++++++++++++++++++++++++++-------
> >  mm/slub.c            | 22 +++++++++----
> >  6 files changed, 91 insertions(+), 20 deletions(-)
> >
> > diff --git a/fs/file_table.c b/fs/file_table.c
> > index 694199a1a966..a69b8a71eacb 100644
> > --- a/fs/file_table.c
> > +++ b/fs/file_table.c
> > @@ -514,9 +514,10 @@ EXPORT_SYMBOL(__fput_sync);
> >
> >  void __init files_init(void)
> >  {
> > -	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
> > -				SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
> > -				SLAB_PANIC | SLAB_ACCOUNT, NULL);
> > +	filp_cachep = kmem_cache_create_rcu("filp", sizeof(struct file),
> > +				offsetof(struct file, __f_slab_free_ptr),
> > +				SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
> > +				NULL);
> >  	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
> >  }
> >
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index 61097a9cf317..de509f5d1446 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -1057,6 +1057,7 @@ struct file {
> >  		struct callback_head	f_task_work;
> >  		struct llist_node	f_llist;
> >  		struct file_ra_state	f_ra;
> > +		void			*__f_slab_free_ptr;
> >  	};
> >  	/* --- cacheline 3 boundary (192 bytes) --- */
> >  } __randomize_layout
> > diff --git a/include/linux/slab.h b/include/linux/slab.h
> > index eb2bf4629157..fc3c3cc9f689 100644
> > --- a/include/linux/slab.h
> > +++ b/include/linux/slab.h
> > @@ -242,6 +242,10 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
> >  			slab_flags_t flags,
> >  			unsigned int useroffset, unsigned int usersize,
> >  			void (*ctor)(void *));
> > +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
> > +					 unsigned int offset,
>
> Just 'offset' is too vague, no?
> Maybe freeptr_offset?

Yep, switched to that.

> > +					 slab_flags_t flags,
> > +					 void (*ctor)(void *));
> >  void kmem_cache_destroy(struct kmem_cache *s);
> >  int kmem_cache_shrink(struct kmem_cache *s);
> >
> > diff --git a/mm/slab.h b/mm/slab.h
> > index dcdb56b8e7f5..122ca41fea34 100644
> > --- a/mm/slab.h
> > +++ b/mm/slab.h
> > @@ -261,6 +261,7 @@ struct kmem_cache {
> >  	unsigned int object_size;	/* Object size without metadata */
> >  	struct reciprocal_value reciprocal_size;
> >  	unsigned int offset;		/* Free pointer offset */
> > +	bool dedicated_offset;		/* Specific free pointer requested */
>
> has_freeptr_offset?
Your comments made me think that this should just be rcu_freeptr_offset
and avoid that boolean completely.

> >  #ifdef CONFIG_SLUB_CPU_PARTIAL
> >  	/* Number of per cpu partial objects to keep around */
> >  	unsigned int cpu_partial;
> > diff --git a/mm/slab_common.c b/mm/slab_common.c
> > index 40b582a014b8..b6ca63859b3a 100644
> > --- a/mm/slab_common.c
> > +++ b/mm/slab_common.c
> > @@ -202,10 +202,10 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
> >  }
> >
> >  static struct kmem_cache *create_cache(const char *name,
> > -			unsigned int object_size, unsigned int align,
> > -			slab_flags_t flags, unsigned int useroffset,
> > -			unsigned int usersize, void (*ctor)(void *),
> > -			struct kmem_cache *root_cache)
> > +			unsigned int object_size, unsigned int offset,
> > +			unsigned int align, slab_flags_t flags,
> > +			unsigned int useroffset, unsigned int usersize,
> > +			void (*ctor)(void *), struct kmem_cache *root_cache)
> >  {
> >  	struct kmem_cache *s;
> >  	int err;
> > @@ -213,6 +213,10 @@ static struct kmem_cache *create_cache(const char *name,
> >  	if (WARN_ON(useroffset + usersize > object_size))
> >  		useroffset = usersize = 0;
> >
> > +	if (WARN_ON(offset >= object_size ||
> > +		    (offset && !(flags & SLAB_TYPESAFE_BY_RCU))))
> > +		offset = 0;
> > +
> >  	err = -ENOMEM;
> >  	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
> >  	if (!s)
> > @@ -226,6 +230,10 @@ static struct kmem_cache *create_cache(const char *name,
> >  	s->useroffset = useroffset;
> >  	s->usersize = usersize;
> >  #endif
> > +	if (offset > 0) {
> > +		s->offset = offset;
> > +		s->dedicated_offset = true;
> > +	}
> >
> >  	err = __kmem_cache_create(s, flags);
> >  	if (err)
> > @@ -269,10 +277,10 @@ static struct kmem_cache *create_cache(const char *name,
> >   *
> >   * Return: a pointer to the cache on success, NULL on failure.
> >   */
> > -struct kmem_cache *
> > -kmem_cache_create_usercopy(const char *name,
> > -			   unsigned int size, unsigned int align,
> > -			   slab_flags_t flags,
> > +static struct kmem_cache *
> > +do_kmem_cache_create_usercopy(const char *name,
> > +			      unsigned int size, unsigned int offset,
> > +			      unsigned int align, slab_flags_t flags,
> >  			      unsigned int useroffset, unsigned int usersize,
> >  			      void (*ctor)(void *))
> >  {
> > @@ -332,7 +340,7 @@ kmem_cache_create_usercopy(const char *name,
> >  		goto out_unlock;
> >  	}
> >
> > -	s = create_cache(cache_name, size,
> > +	s = create_cache(cache_name, size, offset,
> >  			 calculate_alignment(flags, align, size),
> >  			 flags, useroffset, usersize, ctor, NULL);
> >  	if (IS_ERR(s)) {
> > @@ -356,6 +364,16 @@ kmem_cache_create_usercopy(const char *name,
> >  	}
> >  	return s;
> >  }
> > +
> > +struct kmem_cache *
> > +kmem_cache_create_usercopy(const char *name, unsigned int size,
> > +			   unsigned int align, slab_flags_t flags,
> > +			   unsigned int useroffset, unsigned int usersize,
> > +			   void (*ctor)(void *))
> > +{
> > +	return do_kmem_cache_create_usercopy(name, size, 0, align, flags,
> > +					     useroffset, usersize, ctor);
> > +}
> >  EXPORT_SYMBOL(kmem_cache_create_usercopy);
> >
> >  /**
> > @@ -387,11 +405,47 @@ struct kmem_cache *
> >  kmem_cache_create(const char *name, unsigned int size, unsigned int align,
> >  		  slab_flags_t flags, void (*ctor)(void *))
> >  {
> > -	return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
> > -					  ctor);
> > +	return do_kmem_cache_create_usercopy(name, size, 0, align, flags, 0, 0,
> > +					     ctor);
> >  }
> >  EXPORT_SYMBOL(kmem_cache_create);
> >
> > +/**
> > + * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache.
> > + * @name: A string which is used in /proc/slabinfo to identify this cache.
> > + * @size: The size of objects to be created in this cache.
> > + * @offset: The offset into the object where the free pointer is stored
> > + * @flags: SLAB flags
> > + * @ctor: A constructor for the objects.
> > + *
> > + * Cannot be called within an interrupt, but can be interrupted.
> > + * The @ctor is run when new pages are allocated by the cache.
> > + *
> > + * The flags are
> > + *
> > + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
> > + * to catch references to uninitialised memory.
> > + *
> > + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
> > + * for buffer overruns.
> > + *
> > + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
> > + * cacheline. This can be beneficial if you're counting cycles as closely
> > + * as davem.
> > + *
> > + * Return: a pointer to the cache on success, NULL on failure.
> > + */
> > +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
> > +					 unsigned int offset,
> > +					 slab_flags_t flags,
> > +					 void (*ctor)(void *))
> > +{
> > +	return do_kmem_cache_create_usercopy(name, size, offset, 0,
> > +					     flags | SLAB_TYPESAFE_BY_RCU, 0, 0,
> > +					     ctor);
> > +}
> > +EXPORT_SYMBOL(kmem_cache_create_rcu);
> > +
> >  static struct kmem_cache *kmem_buckets_cache __ro_after_init;
> >
> >  /**
> > diff --git a/mm/slub.c b/mm/slub.c
> > index c9d8a2497fd6..34eac3f9a46e 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -3926,7 +3926,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
> >  							   void *obj)
> >  {
> >  	if (unlikely(slab_want_init_on_free(s)) && obj &&
> > -	    !freeptr_outside_object(s))
> > +	    !freeptr_outside_object(s) && !s->dedicated_offset)
> >  		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
> >  		       0, sizeof(void *));
> >  }
> > @@ -5153,6 +5153,7 @@ static int calculate_sizes(struct kmem_cache *s)
> >  	slab_flags_t flags = s->flags;
> >  	unsigned int size = s->object_size;
> >  	unsigned int order;
> > +	bool must_use_freeptr_offset;
> >
> >  	/*
> >  	 * Round up object size to the next word boundary. We can only
> > @@ -5189,9 +5190,12 @@ static int calculate_sizes(struct kmem_cache *s)
> >  	 */
> >  	s->inuse = size;
> >
> > -	if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor ||
> > -	    ((flags & SLAB_RED_ZONE) &&
> > -	     (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
> > +	must_use_freeptr_offset =
> > +		(flags & SLAB_POISON) || s->ctor ||
> > +		((flags & SLAB_RED_ZONE) &&
> > +		 (s->object_size < sizeof(void *) || slub_debug_orig_size(s)));
> > +
> > +	if ((flags & SLAB_TYPESAFE_BY_RCU) || must_use_freeptr_offset) {
> >  		/*
> >  		 * Relocate free pointer after the object if it is not
> >  		 * permitted to overwrite the first word of the object on
> > @@ -5208,8 +5212,13 @@ static int calculate_sizes(struct kmem_cache *s)
> >  		 * freeptr_outside_object() function. If that is no
> >  		 * longer true, the function needs to be modified.
> >  		 */
> > -		s->offset = size;
> > -		size += sizeof(void *);
> > +		if (!(flags & SLAB_TYPESAFE_BY_RCU) || must_use_freeptr_offset) {
> > +			s->offset = size;
> > +			size += sizeof(void *);
> > +			s->dedicated_offset = false;
> > +		} else {
> > +			s->dedicated_offset = true;
>
> Hmm, this seems to set s->dedicated_offset for any SLAB_TYPESAFE_BY_RCU cache,
> even those that weren't created with kmem_cache_create_rcu().
>
> Shouldn't we have
>
> 	must_use_freeptr_offset =
> 		((flags & SLAB_TYPESAFE_BY_RCU) && !s->dedicated_offset) ||
> 		(flags & SLAB_POISON) || s->ctor ||
> 		((flags & SLAB_RED_ZONE) &&
> 		 (s->object_size < sizeof(void *) || slub_debug_orig_size(s)));
>
> 	if (must_use_freeptr_offset) {
> 		...
> 	}

Yep, that's better. Will send a fixed version.
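
For anyone following along, here's roughly what adopting the new API
looks like from the cache owner's side, modeled on the filp conversion
above. This is only a minimal sketch against this version of the patch
(so the parameter is still called 'offset', not freeptr_offset); struct
foo, __foo_free_ptr and foo_cachep are made-up names for illustration:

	/* Hypothetical object with a dedicated slot for the free pointer. */
	struct foo {
		unsigned long payload;
		/*
		 * Reused by the allocator for the free pointer once the
		 * object has been freed. It must lie inside the object and
		 * must not overlap anything that readers may still inspect
		 * under RCU while the object is being recycled.
		 */
		void *__foo_free_ptr;
	};

	static struct kmem_cache *foo_cachep;

	void __init foo_cache_init(void)
	{
		/*
		 * kmem_cache_create_rcu() ORs in SLAB_TYPESAFE_BY_RCU itself
		 * and stores the free pointer at the given offset instead of
		 * appending it after the object, so no extra cacheline is
		 * added.
		 */
		foo_cachep = kmem_cache_create_rcu("foo", sizeof(struct foo),
					offsetof(struct foo, __foo_free_ptr),
					SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
	}

The interesting property is that the cache owner, not the allocator,
decides where the free pointer lives, so an otherwise unused hole in
the object (as with struct file above) can absorb it.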