Re: [PATCH v3 2/3] mm: add kmem_cache_create_rcu()

On Wed, Aug 28, 2024 at 12:56:24PM +0200, Christian Brauner wrote:
> When a kmem cache is created with SLAB_TYPESAFE_BY_RCU, the free
> pointer must be located outside of the object because we don't know
> what part of the memory can safely be overwritten, as any of it may be
> needed to prevent object recycling.
> 
> The consequence is that SLAB_TYPESAFE_BY_RCU may end up adding a new
> cacheline. This is the case for, e.g., struct file: after shrinking it
> by 40 bytes so that it fits into three cachelines, SLAB_TYPESAFE_BY_RCU
> still adds a fourth cacheline because it needs to accommodate the free
> pointer.
> 
> Add a new kmem_cache_create_rcu() function that allows the caller to
> specify an offset where the free pointer is supposed to be placed.
> 
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>

FWIW
Acked-by: Mike Rapoport (Microsoft) <rppt@xxxxxxxxxx>
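
FWIW, from the caller's side I'd expect this to look something like the
sketch below: embed a freeptr_t in the struct, overlapping memory that
is only needed once the object is already on its way out, and pass its
offset at cache creation time. The struct, the field names and the
union partner here are illustrative assumptions, not taken from this
series:

	struct foo {
		spinlock_t lock;
		/*
		 * Fields that RCU readers re-check to detect recycling
		 * must not overlap the union below: the freelist
		 * pointer is written into it once the object is freed.
		 */
		union {
			struct rcu_head rcu;	/* only used on the free path */
			freeptr_t freeptr;	/* doubles as the SLUB freelist pointer */
		};
	};

	foo_cachep = kmem_cache_create_rcu("foo", sizeof(struct foo),
					   offsetof(struct foo, freeptr),
					   SLAB_ACCOUNT);

kmem_cache_create_rcu() ORs in SLAB_TYPESAFE_BY_RCU itself, so the
cache gets the RCU semantics without the extra cacheline for an
external free pointer.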

> ---
>  include/linux/slab.h |   9 ++++
>  mm/slab.h            |   2 +
>  mm/slab_common.c     | 136 ++++++++++++++++++++++++++++++++++++---------------
>  mm/slub.c            |  20 +++++---
>  4 files changed, 121 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index eb2bf4629157..5b2da2cf31a8 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -212,6 +212,12 @@ enum _slab_flag_bits {
>  #define SLAB_NO_OBJ_EXT		__SLAB_FLAG_UNUSED
>  #endif
>  
> +/*
> + * freeptr_t represents a SLUB freelist pointer, which might be encoded
> + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
> + */
> +typedef struct { unsigned long v; } freeptr_t;
> +
>  /*
>   * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
>   *
> @@ -242,6 +248,9 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
>  			slab_flags_t flags,
>  			unsigned int useroffset, unsigned int usersize,
>  			void (*ctor)(void *));
> +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
> +					 unsigned int freeptr_offset,
> +					 slab_flags_t flags);
>  void kmem_cache_destroy(struct kmem_cache *s);
>  int kmem_cache_shrink(struct kmem_cache *s);
>  
> diff --git a/mm/slab.h b/mm/slab.h
> index dcdb56b8e7f5..a6051385186e 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -261,6 +261,8 @@ struct kmem_cache {
>  	unsigned int object_size;	/* Object size without metadata */
>  	struct reciprocal_value reciprocal_size;
>  	unsigned int offset;		/* Free pointer offset */
> +	/* Specific free pointer offset requested (if not UINT_MAX) */
> +	unsigned int rcu_freeptr_offset;
>  #ifdef CONFIG_SLUB_CPU_PARTIAL
>  	/* Number of per cpu partial objects to keep around */
>  	unsigned int cpu_partial;
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index c8dd7e08c5f6..887f6b9855dd 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -202,9 +202,10 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
>  }
>  
>  static struct kmem_cache *create_cache(const char *name,
> -		unsigned int object_size, unsigned int align,
> -		slab_flags_t flags, unsigned int useroffset,
> -		unsigned int usersize, void (*ctor)(void *))
> +		unsigned int object_size, unsigned int freeptr_offset,
> +		unsigned int align, slab_flags_t flags,
> +		unsigned int useroffset, unsigned int usersize,
> +		void (*ctor)(void *))
>  {
>  	struct kmem_cache *s;
>  	int err;
> @@ -212,6 +213,13 @@ static struct kmem_cache *create_cache(const char *name,
>  	if (WARN_ON(useroffset + usersize > object_size))
>  		useroffset = usersize = 0;
>  
> +	/* If a custom freelist pointer offset is requested, make sure it's sane. */
> +	err = -EINVAL;
> +	if (freeptr_offset != UINT_MAX &&
> +	    (freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) ||
> +	     !IS_ALIGNED(freeptr_offset, sizeof(freeptr_t))))
> +		goto out;
> +
>  	err = -ENOMEM;
>  	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
>  	if (!s)
> @@ -219,13 +227,13 @@ static struct kmem_cache *create_cache(const char *name,
>  
>  	s->name = name;
>  	s->size = s->object_size = object_size;
> +	s->rcu_freeptr_offset = freeptr_offset;
>  	s->align = align;
>  	s->ctor = ctor;
>  #ifdef CONFIG_HARDENED_USERCOPY
>  	s->useroffset = useroffset;
>  	s->usersize = usersize;
>  #endif
> -
>  	err = __kmem_cache_create(s, flags);
>  	if (err)
>  		goto out_free_cache;
> @@ -240,38 +248,10 @@ static struct kmem_cache *create_cache(const char *name,
>  	return ERR_PTR(err);
>  }
>  
> -/**
> - * kmem_cache_create_usercopy - Create a cache with a region suitable
> - * for copying to userspace
> - * @name: A string which is used in /proc/slabinfo to identify this cache.
> - * @size: The size of objects to be created in this cache.
> - * @align: The required alignment for the objects.
> - * @flags: SLAB flags
> - * @useroffset: Usercopy region offset
> - * @usersize: Usercopy region size
> - * @ctor: A constructor for the objects.
> - *
> - * Cannot be called within a interrupt, but can be interrupted.
> - * The @ctor is run when new pages are allocated by the cache.
> - *
> - * The flags are
> - *
> - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
> - * to catch references to uninitialised memory.
> - *
> - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
> - * for buffer overruns.
> - *
> - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
> - * cacheline.  This can be beneficial if you're counting cycles as closely
> - * as davem.
> - *
> - * Return: a pointer to the cache on success, NULL on failure.
> - */
> -struct kmem_cache *
> -kmem_cache_create_usercopy(const char *name,
> -		  unsigned int size, unsigned int align,
> -		  slab_flags_t flags,
> +static struct kmem_cache *
> +do_kmem_cache_create_usercopy(const char *name,
> +		  unsigned int size, unsigned int freeptr_offset,
> +		  unsigned int align, slab_flags_t flags,
>  		  unsigned int useroffset, unsigned int usersize,
>  		  void (*ctor)(void *))
>  {
> @@ -331,7 +311,7 @@ kmem_cache_create_usercopy(const char *name,
>  		goto out_unlock;
>  	}
>  
> -	s = create_cache(cache_name, size,
> +	s = create_cache(cache_name, size, freeptr_offset,
>  			 calculate_alignment(flags, align, size),
>  			 flags, useroffset, usersize, ctor);
>  	if (IS_ERR(s)) {
> @@ -355,6 +335,45 @@ kmem_cache_create_usercopy(const char *name,
>  	}
>  	return s;
>  }
> +
> +/**
> + * kmem_cache_create_usercopy - Create a cache with a region suitable
> + * for copying to userspace
> + * @name: A string which is used in /proc/slabinfo to identify this cache.
> + * @size: The size of objects to be created in this cache.
> + * @freeptr_offset: Custom offset for the free pointer in RCU caches
> + * @align: The required alignment for the objects.
> + * @flags: SLAB flags
> + * @useroffset: Usercopy region offset
> + * @usersize: Usercopy region size
> + * @ctor: A constructor for the objects.
> + *
> + * Cannot be called within an interrupt, but can be interrupted.
> + * The @ctor is run when new pages are allocated by the cache.
> + *
> + * The flags are
> + *
> + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
> + * to catch references to uninitialised memory.
> + *
> + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
> + * for buffer overruns.
> + *
> + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
> + * cacheline.  This can be beneficial if you're counting cycles as closely
> + * as davem.
> + *
> + * Return: a pointer to the cache on success, NULL on failure.
> + */
> +struct kmem_cache *
> +kmem_cache_create_usercopy(const char *name, unsigned int size,
> +			   unsigned int align, slab_flags_t flags,
> +			   unsigned int useroffset, unsigned int usersize,
> +			   void (*ctor)(void *))
> +{
> +	return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags,
> +					     useroffset, usersize, ctor);
> +}
>  EXPORT_SYMBOL(kmem_cache_create_usercopy);
>  
>  /**
> @@ -386,11 +405,50 @@ struct kmem_cache *
>  kmem_cache_create(const char *name, unsigned int size, unsigned int align,
>  		slab_flags_t flags, void (*ctor)(void *))
>  {
> -	return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
> -					  ctor);
> +	return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags,
> +					     0, 0, ctor);
>  }
>  EXPORT_SYMBOL(kmem_cache_create);
>  
> +/**
> + * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache.
> + * @name: A string which is used in /proc/slabinfo to identify this cache.
> + * @size: The size of objects to be created in this cache.
> + * @freeptr_offset: The offset into the memory to the free pointer
> + * @flags: SLAB flags
> + *
> + * Cannot be called within an interrupt, but can be interrupted.
> + *
> + * See kmem_cache_create() for an explanation of possible @flags.
> + *
> + * By default SLAB_TYPESAFE_BY_RCU caches place the free pointer outside
> + * of the object. This might cause the object to grow in size. Callers
> + * that have a reason to avoid this can specify a custom free pointer
> + * offset in their struct where the free pointer will be placed.
> + *
> + * Note that placing the free pointer inside the object requires the
> + * caller to ensure that no fields which are needed to guard against
> + * object recycling are invalidated (see SLAB_TYPESAFE_BY_RCU for
> + * details).
> + *
> + * Using zero as a value for @freeptr_offset is valid. To request no
> + * offset, UINT_MAX must be specified.
> + *
> + * Note that @ctor isn't supported with custom free pointers, as a
> + * @ctor requires an external free pointer.
> + *
> + * Return: a pointer to the cache on success, NULL on failure.
> + */
> +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
> +					 unsigned int freeptr_offset,
> +					 slab_flags_t flags)
> +{
> +	return do_kmem_cache_create_usercopy(name, size, freeptr_offset, 0,
> +					     flags | SLAB_TYPESAFE_BY_RCU, 0, 0,
> +					     NULL);
> +}
> +EXPORT_SYMBOL(kmem_cache_create_rcu);
> +
>  static struct kmem_cache *kmem_buckets_cache __ro_after_init;
>  
>  /**
> diff --git a/mm/slub.c b/mm/slub.c
> index c9d8a2497fd6..9aa5da1e8e27 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -465,12 +465,6 @@ static struct workqueue_struct *flushwq;
>   * 			Core slab cache functions
>   *******************************************************************/
>  
> -/*
> - * freeptr_t represents a SLUB freelist pointer, which might be encoded
> - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
> - */
> -typedef struct { unsigned long v; } freeptr_t;
> -
>  /*
>   * Returns freelist pointer (ptr). With hardening, this is obfuscated
>   * with an XOR of the address where the pointer is held and a per-cache
> @@ -3921,6 +3915,9 @@ static void *__slab_alloc_node(struct kmem_cache *s,
>  /*
>   * If the object has been wiped upon free, make sure it's fully initialized by
>   * zeroing out freelist pointer.
> + *
> + * Note that we also wipe custom freelist pointers specified via
> + * s->rcu_freeptr_offset.
>   */
>  static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
>  						   void *obj)
> @@ -5144,6 +5141,12 @@ static void set_cpu_partial(struct kmem_cache *s)
>  #endif
>  }
>  
> +/* Was a valid freeptr offset requested? */
> +static inline bool has_freeptr_offset(const struct kmem_cache *s)
> +{
> +	return s->rcu_freeptr_offset != UINT_MAX;
> +}
> +
>  /*
>   * calculate_sizes() determines the order and the distribution of data within
>   * a slab object.
> @@ -5189,7 +5192,8 @@ static int calculate_sizes(struct kmem_cache *s)
>  	 */
>  	s->inuse = size;
>  
> -	if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor ||
> +	if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) ||
> +	    (flags & SLAB_POISON) || s->ctor ||
>  	    ((flags & SLAB_RED_ZONE) &&
>  	     (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
>  		/*
> @@ -5210,6 +5214,8 @@ static int calculate_sizes(struct kmem_cache *s)
>  		 */
>  		s->offset = size;
>  		size += sizeof(void *);
> +	} else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) {
> +		s->offset = s->rcu_freeptr_offset;
>  	} else {
>  		/*
>  		 * Store freelist pointer near middle of object to keep
> 
> -- 
> 2.45.2
> 
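One more caller-side note: create_cache() now rejects (-EINVAL) an
offset that isn't freeptr_t-aligned or that lies outside the object,
so it's probably worth pinning those invariants down at compile time
next to the struct definition. Again just a sketch, reusing the
hypothetical struct foo/freeptr from above:

	static_assert(offsetof(struct foo, freeptr) < sizeof(struct foo));
	static_assert(IS_ALIGNED(offsetof(struct foo, freeptr),
				 sizeof(freeptr_t)));

That mirrors the new sanity check, so a botched offset fails the build
instead of failing cache creation at runtime.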

-- 
Sincerely yours,
Mike.



