On Wed, Aug 28, 2024 at 12:56:24PM +0200, Christian Brauner wrote:
> When a kmem cache is created with SLAB_TYPESAFE_BY_RCU the free pointer
> must be located outside of the object because we don't know what part of
> the memory can safely be overwritten as it may be needed to prevent
> object recycling.
> 
> That has the consequence that SLAB_TYPESAFE_BY_RCU may end up adding a
> new cacheline. This is the case for e.g., struct file. After having it
> shrunk down by 40 bytes and having it fit in three cachelines we still
> have SLAB_TYPESAFE_BY_RCU adding a fourth cacheline because it needs to
> accommodate the free pointer.
> 
> Add a new kmem_cache_create_rcu() function that allows the caller to
> specify an offset where the free pointer is supposed to be placed.
> 
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>

FWIW

Acked-by: Mike Rapoport (Microsoft) <rppt@xxxxxxxxxx>

> ---
>  include/linux/slab.h |   9 ++++
>  mm/slab.h            |   2 +
>  mm/slab_common.c     | 136 ++++++++++++++++++++++++++++++++++++---------------
>  mm/slub.c            |  20 +++++---
>  4 files changed, 121 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index eb2bf4629157..5b2da2cf31a8 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -212,6 +212,12 @@ enum _slab_flag_bits {
>  #define SLAB_NO_OBJ_EXT		__SLAB_FLAG_UNUSED
>  #endif
>  
> +/*
> + * freeptr_t represents a SLUB freelist pointer, which might be encoded
> + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
> + */
> +typedef struct { unsigned long v; } freeptr_t;
> +
>  /*
>   * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
>   *
> @@ -242,6 +248,9 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
>  			slab_flags_t flags,
>  			unsigned int useroffset, unsigned int usersize,
>  			void (*ctor)(void *));
> +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
> +					 unsigned int freeptr_offset,
> +					 slab_flags_t flags);
>  void kmem_cache_destroy(struct kmem_cache *s);
>  int kmem_cache_shrink(struct kmem_cache *s);
> 
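The interface is pleasant from the caller side. For the record, this is
roughly how I'd expect a user of the new helper to look — a made-up sketch,
the struct, field and function names below are all mine and not from this
series:

#include <linux/slab.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/stddef.h>

/*
 * Hypothetical object for a "foo" cache. The union lets the freelist
 * pointer reuse space that is only meaningful while the object is live;
 * ->ref and ->key stay outside of it so they survive a free/reuse cycle.
 */
struct foo {
	refcount_t ref;
	unsigned long key;
	union {
		struct list_head lru;	/* only valid while the object is live */
		freeptr_t freeptr;	/* overwritten by SLUB once freed */
	};
};

static struct kmem_cache *foo_cachep __ro_after_init;

static int __init foo_cache_init(void)
{
	/* SLAB_TYPESAFE_BY_RCU is ORed in by kmem_cache_create_rcu() itself. */
	foo_cachep = kmem_cache_create_rcu("foo", sizeof(struct foo),
					   offsetof(struct foo, freeptr),
					   SLAB_ACCOUNT);
	return foo_cachep ? 0 : -ENOMEM;
}

The union is naturally pointer-aligned here, so the offset satisfies the
IS_ALIGNED() check below without any extra annotation.
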
> diff --git a/mm/slab.h b/mm/slab.h
> index dcdb56b8e7f5..a6051385186e 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -261,6 +261,8 @@ struct kmem_cache {
>  	unsigned int object_size;	/* Object size without metadata */
>  	struct reciprocal_value reciprocal_size;
>  	unsigned int offset;		/* Free pointer offset */
> +	/* Specific free pointer requested (if not UINT_MAX) */
> +	unsigned int rcu_freeptr_offset;
>  #ifdef CONFIG_SLUB_CPU_PARTIAL
>  	/* Number of per cpu partial objects to keep around */
>  	unsigned int cpu_partial;
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index c8dd7e08c5f6..887f6b9855dd 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -202,9 +202,10 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
>  }
>  
>  static struct kmem_cache *create_cache(const char *name,
> -				unsigned int object_size, unsigned int align,
> -				slab_flags_t flags, unsigned int useroffset,
> -				unsigned int usersize, void (*ctor)(void *))
> +				unsigned int object_size, unsigned int freeptr_offset,
> +				unsigned int align, slab_flags_t flags,
> +				unsigned int useroffset, unsigned int usersize,
> +				void (*ctor)(void *))
>  {
>  	struct kmem_cache *s;
>  	int err;
>  
> @@ -212,6 +213,13 @@ static struct kmem_cache *create_cache(const char *name,
>  	if (WARN_ON(useroffset + usersize > object_size))
>  		useroffset = usersize = 0;
>  
> +	/* If a custom freelist pointer is requested make sure it's sane. */
> +	err = -EINVAL;
> +	if (freeptr_offset != UINT_MAX &&
> +	    (freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) ||
> +	     !IS_ALIGNED(freeptr_offset, sizeof(freeptr_t))))
> +		goto out;
> +
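The inverted condition took me a second to parse. Just to confirm my reading
(not suggesting a code change), the acceptance criterion is equivalent to
this hypothetical helper:

/*
 * Affirmative restatement of the check above: either no custom offset was
 * requested (the UINT_MAX sentinel), or the cache is SLAB_TYPESAFE_BY_RCU
 * and the offset is inside the object and freelist-pointer aligned.
 */
static inline bool freeptr_offset_valid(unsigned int freeptr_offset,
					unsigned int object_size,
					slab_flags_t flags)
{
	return freeptr_offset == UINT_MAX ||
	       ((flags & SLAB_TYPESAFE_BY_RCU) &&
		freeptr_offset < object_size &&
		IS_ALIGNED(freeptr_offset, sizeof(freeptr_t)));
}
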
>  	err = -ENOMEM;
>  	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
>  	if (!s)
> @@ -219,13 +227,13 @@ static struct kmem_cache *create_cache(const char *name,
>  
>  	s->name = name;
>  	s->size = s->object_size = object_size;
> +	s->rcu_freeptr_offset = freeptr_offset;
>  	s->align = align;
>  	s->ctor = ctor;
>  #ifdef CONFIG_HARDENED_USERCOPY
>  	s->useroffset = useroffset;
>  	s->usersize = usersize;
>  #endif
> -
>  	err = __kmem_cache_create(s, flags);
>  	if (err)
>  		goto out_free_cache;
> @@ -240,38 +248,10 @@ static struct kmem_cache *create_cache(const char *name,
>  	return ERR_PTR(err);
>  }
>  
> -/**
> - * kmem_cache_create_usercopy - Create a cache with a region suitable
> - * for copying to userspace
> - * @name: A string which is used in /proc/slabinfo to identify this cache.
> - * @size: The size of objects to be created in this cache.
> - * @align: The required alignment for the objects.
> - * @flags: SLAB flags
> - * @useroffset: Usercopy region offset
> - * @usersize: Usercopy region size
> - * @ctor: A constructor for the objects.
> - *
> - * Cannot be called within a interrupt, but can be interrupted.
> - * The @ctor is run when new pages are allocated by the cache.
> - *
> - * The flags are
> - *
> - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
> - * to catch references to uninitialised memory.
> - *
> - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
> - * for buffer overruns.
> - *
> - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
> - * cacheline. This can be beneficial if you're counting cycles as closely
> - * as davem.
> - *
> - * Return: a pointer to the cache on success, NULL on failure.
> - */
> -struct kmem_cache *
> -kmem_cache_create_usercopy(const char *name,
> -			   unsigned int size, unsigned int align,
> -			   slab_flags_t flags,
> +static struct kmem_cache *
> +do_kmem_cache_create_usercopy(const char *name,
> +			      unsigned int size, unsigned int freeptr_offset,
> +			      unsigned int align, slab_flags_t flags,
>  			   unsigned int useroffset, unsigned int usersize,
>  			   void (*ctor)(void *))
>  {
> @@ -331,7 +311,7 @@ kmem_cache_create_usercopy(const char *name,
>  		goto out_unlock;
>  	}
>  
> -	s = create_cache(cache_name, size,
> +	s = create_cache(cache_name, size, freeptr_offset,
>  			 calculate_alignment(flags, align, size),
>  			 flags, useroffset, usersize, ctor);
>  	if (IS_ERR(s)) {
> @@ -355,6 +335,45 @@ kmem_cache_create_usercopy(const char *name,
>  	}
>  	return s;
>  }
> +
> +/**
> + * kmem_cache_create_usercopy - Create a cache with a region suitable
> + * for copying to userspace
> + * @name: A string which is used in /proc/slabinfo to identify this cache.
> + * @size: The size of objects to be created in this cache.
> + * @freeptr_offset: Custom offset for the free pointer in RCU caches
> + * @align: The required alignment for the objects.
> + * @flags: SLAB flags
> + * @useroffset: Usercopy region offset
> + * @usersize: Usercopy region size
> + * @ctor: A constructor for the objects.
> + *
> + * Cannot be called within a interrupt, but can be interrupted.
> + * The @ctor is run when new pages are allocated by the cache.
> + *
> + * The flags are
> + *
> + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
> + * to catch references to uninitialised memory.
> + *
> + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
> + * for buffer overruns.
> + *
> + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
> + * cacheline. This can be beneficial if you're counting cycles as closely
> + * as davem.
> + *
> + * Return: a pointer to the cache on success, NULL on failure.
> + */
> +struct kmem_cache *
> +kmem_cache_create_usercopy(const char *name, unsigned int size,
> +			   unsigned int align, slab_flags_t flags,
> +			   unsigned int useroffset, unsigned int usersize,
> +			   void (*ctor)(void *))
> +{
> +	return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags,
> +					     useroffset, usersize, ctor);
> +}
>  EXPORT_SYMBOL(kmem_cache_create_usercopy);
>  
>  /**
> @@ -386,11 +405,50 @@ struct kmem_cache *
>  kmem_cache_create(const char *name, unsigned int size, unsigned int align,
>  		  slab_flags_t flags, void (*ctor)(void *))
>  {
> -	return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
> -					  ctor);
> +	return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags,
> +					     0, 0, ctor);
>  }
>  EXPORT_SYMBOL(kmem_cache_create);
>  
> +/**
> + * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache.
> + * @name: A string which is used in /proc/slabinfo to identify this cache.
> + * @size: The size of objects to be created in this cache.
> + * @freeptr_offset: The offset into the memory to the free pointer
> + * @flags: SLAB flags
> + *
> + * Cannot be called within an interrupt, but can be interrupted.
> + *
> + * See kmem_cache_create() for an explanation of possible @flags.
> + *
> + * By default SLAB_TYPESAFE_BY_RCU caches place the free pointer outside
> + * of the object. This might cause the object to grow in size. Callers
> + * that have a reason to avoid this can specify a custom free pointer
> + * offset in their struct where the free pointer will be placed.
> + *
> + * Note that placing the free pointer inside the object requires the
> + * caller to ensure that no fields are invalidated that are required to
> + * guard against object recycling (See SLAB_TYPESAFE_BY_RCU for
> + * details.).
> + *
> + * Using zero as a value for @freeptr_offset is valid. To request no
> + * offset UINT_MAX must be specified.
> + *
> + * Note that @ctor isn't supported with custom free pointers as a @ctor
> + * requires an external free pointer.
> + *
> + * Return: a pointer to the cache on success, NULL on failure.
> + */
> +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
> +					 unsigned int freeptr_offset,
> +					 slab_flags_t flags)
> +{
> +	return do_kmem_cache_create_usercopy(name, size, freeptr_offset, 0,
> +					     flags | SLAB_TYPESAFE_BY_RCU, 0, 0,
> +					     NULL);
> +}
> +EXPORT_SYMBOL(kmem_cache_create_rcu);
> +
>  static struct kmem_cache *kmem_buckets_cache __ro_after_init;
>  
>  /**
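One thing that may be worth spelling out for future users — my sketch below,
not part of the patch: whatever fields a reader relies on to detect recycling
must not overlap @freeptr_offset, since they have to stay meaningful after
the object has been freed and reused. For the made-up "foo" cache from my
earlier note, the lookup side keeps the usual SLAB_TYPESAFE_BY_RCU pattern
(foo_hash_find() and foo_put() are hypothetical helpers):

/*
 * Sketch only: the object may be freed and recycled at any time under RCU,
 * so take a reference and re-check the key after the lookup. Both ->ref
 * and ->key live outside the union holding the freelist pointer, so
 * neither is clobbered when the object sits on a freelist.
 */
static struct foo *foo_lookup(unsigned long key)
{
	struct foo *obj;

	rcu_read_lock();
	obj = foo_hash_find(key);			/* hypothetical lookup */
	if (obj && !refcount_inc_not_zero(&obj->ref))
		obj = NULL;				/* concurrently freed */
	if (obj && READ_ONCE(obj->key) != key) {
		foo_put(obj);				/* recycled for another key */
		obj = NULL;
	}
	rcu_read_unlock();

	return obj;
}
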
> diff --git a/mm/slub.c b/mm/slub.c
> index c9d8a2497fd6..9aa5da1e8e27 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -465,12 +465,6 @@ static struct workqueue_struct *flushwq;
>   *			Core slab cache functions
>   *******************************************************************/
>  
> -/*
> - * freeptr_t represents a SLUB freelist pointer, which might be encoded
> - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
> - */
> -typedef struct { unsigned long v; } freeptr_t;
> -
>  /*
>   * Returns freelist pointer (ptr). With hardening, this is obfuscated
>   * with an XOR of the address where the pointer is held and a per-cache
> @@ -3921,6 +3915,9 @@ static void *__slab_alloc_node(struct kmem_cache *s,
>  /*
>   * If the object has been wiped upon free, make sure it's fully initialized by
>   * zeroing out freelist pointer.
> + *
> + * Note that we also wipe custom freelist pointers specified via
> + * s->rcu_freeptr_offset.
>   */
>  static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
>  						   void *obj)
> @@ -5144,6 +5141,12 @@ static void set_cpu_partial(struct kmem_cache *s)
>  #endif
>  }
>  
> +/* Was a valid freeptr offset requested? */
> +static inline bool has_freeptr_offset(const struct kmem_cache *s)
> +{
> +	return s->rcu_freeptr_offset != UINT_MAX;
> +}
> +
>  /*
>   * calculate_sizes() determines the order and the distribution of data within
>   * a slab object.
> @@ -5189,7 +5192,8 @@ static int calculate_sizes(struct kmem_cache *s)
>  	 */
>  	s->inuse = size;
>  
> -	if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor ||
> +	if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) ||
> +	    (flags & SLAB_POISON) || s->ctor ||
>  	    ((flags & SLAB_RED_ZONE) &&
>  	     (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
>  		/*
> @@ -5210,6 +5214,8 @@ static int calculate_sizes(struct kmem_cache *s)
>  		 */
>  		s->offset = size;
>  		size += sizeof(void *);
> +	} else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) {
> +		s->offset = s->rcu_freeptr_offset;
>  	} else {
>  		/*
>  		 * Store freelist pointer near middle of object to keep
> 
> -- 
> 2.45.2
> 
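If I read calculate_sizes() right, this last hunk is exactly what buys back
the cacheline from the commit message. Back-of-the-envelope, with
illustrative numbers of my own rather than the real struct file layout:

	object_size = 192	/* three 64-byte cachelines */

	no custom offset:	s->offset = 192, size = 200
				-> padded to 256 for a cacheline-aligned
				   cache, i.e. the fourth cacheline
	custom freeptr_offset:	s->offset points inside the object,
				size stays 192, i.e. three cachelines

-- 
Sincerely yours,
Mike.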