On Fri, Aug 9, 2024 at 12:33 AM Kees Cook <kees@xxxxxxxxxx> wrote:
>
> Use separate per-call-site kmem_cache or kmem_buckets. These are
> allocated on demand to avoid wasting memory for unused caches.
>
> A few caches need to be allocated very early to support allocating the
> caches themselves: kstrdup(), kvasprintf(), and pcpu_mem_zalloc(). Any
> GFP_ATOMIC allocations are currently left to be allocated from
> KMALLOC_NORMAL.
>
> With a distro config, /proc/slabinfo grows from ~400 entries to ~2200.
>
> Since this feature (CONFIG_SLAB_PER_SITE) is redundant to
> CONFIG_RANDOM_KMALLOC_CACHES, mark it as incompatible. Add Kconfig help
> text that compares the features.
>
> Improvements needed:
> - Retain call site gfp flags in alloc_tag meta field to:
>   - pre-allocate all GFP_ATOMIC caches (since their caches cannot
>     be allocated on demand unless we want them to be GFP_ATOMIC
>     themselves...)

I'm currently working on a feature to identify allocations with
__GFP_ACCOUNT known at compile time (similar to how you handle the
size in the previous patch). Might be something you can reuse/extend.

> - Separate MEMCG allocations as well

Do you mean allocations with __GFP_ACCOUNT or something else?

> - Allocate individual caches within kmem_buckets on demand to
>   further reduce memory usage overhead.
>
> Signed-off-by: Kees Cook <kees@xxxxxxxxxx>
> ---
> Cc: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> Cc: Kent Overstreet <kent.overstreet@xxxxxxxxx>
> Cc: Vlastimil Babka <vbabka@xxxxxxx>
> Cc: Christoph Lameter <cl@xxxxxxxxx>
> Cc: Pekka Enberg <penberg@xxxxxxxxxx>
> Cc: David Rientjes <rientjes@xxxxxxxxxx>
> Cc: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Cc: Roman Gushchin <roman.gushchin@xxxxxxxxx>
> Cc: Hyeonggon Yoo <42.hyeyoo@xxxxxxxxx>
> Cc: linux-mm@xxxxxxxxx
> ---
>  include/linux/alloc_tag.h |   8 +++
>  lib/alloc_tag.c           | 121 +++++++++++++++++++++++++++++++++++---
>  mm/Kconfig                |  19 +++++-
>  mm/slab_common.c          |   1 +
>  mm/slub.c                 |  31 +++++++++-
>  5 files changed, 170 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> index f5d8c5849b82..c95628f9b049 100644
> --- a/include/linux/alloc_tag.h
> +++ b/include/linux/alloc_tag.h
> @@ -24,6 +24,7 @@ struct alloc_tag_counters {
>  struct alloc_meta {
>         /* 0 means non-slab, SIZE_MAX means dynamic, and everything else is fixed-size. */
>         size_t sized;
> +       void *cache;

I see now where that meta.cache in the previous patch came from...
That part should be moved here.

>  };
>  #define ALLOC_META_INIT(_size)  {                                      \
>                 .sized = (__builtin_constant_p(_size) ? (_size) : SIZE_MAX),    \
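For the "retain call site gfp flags" item in your TODO list (and the
__GFP_ACCOUNT work I mentioned above), I was imagining something along
these lines. ALLOC_META_INIT_GFP and the gfp member are made-up names,
so treat this purely as a sketch:

        struct alloc_meta {
                /* 0 means non-slab, SIZE_MAX means dynamic, and everything else is fixed-size. */
                size_t sized;
                /* Call site gfp flags when compile-time constant, 0 otherwise. */
                gfp_t gfp;
                void *cache;
        };

        #define ALLOC_META_INIT_GFP(_size, _gfp) {                              \
                .sized = (__builtin_constant_p(_size) ? (_size) : SIZE_MAX),    \
                .gfp = (__builtin_constant_p(_gfp) ? (_gfp) : 0),               \
        }

With that, the early walk (or module load) could pre-allocate caches
for any site whose recorded flags contain GFP_ATOMIC or __GFP_ACCOUNT,
instead of leaving those sites in KMALLOC_NORMAL.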
> @@ -216,6 +217,13 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
>
>  #endif /* CONFIG_MEM_ALLOC_PROFILING */
>
> +#ifdef CONFIG_SLAB_PER_SITE
> +void alloc_tag_early_walk(void);
> +void alloc_tag_site_init(struct codetag *ct, bool ondemand);
> +#else
> +static inline void alloc_tag_early_walk(void) {}
> +#endif
> +
>  #define alloc_hooks_tag(_tag, _do_alloc)                               \
>  ({                                                                     \
>         struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \
> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> index 6d2cb72bf269..e8a66a7c4a6b 100644
> --- a/lib/alloc_tag.c
> +++ b/lib/alloc_tag.c
> @@ -157,6 +157,89 @@ static void __init procfs_init(void)
>         proc_create_seq("allocinfo", 0400, NULL, &allocinfo_seq_op);
>  }
>
> +#ifdef CONFIG_SLAB_PER_SITE
> +static bool ondemand_ready;
> +
> +void alloc_tag_site_init(struct codetag *ct, bool ondemand)
> +{
> +       struct alloc_tag *tag = ct_to_alloc_tag(ct);
> +       char *name;
> +       void *p, *old;
> +
> +       /* Only handle kmalloc allocations. */
> +       if (!tag->meta.sized)
> +               return;
> +
> +       /* Must be ready for on-demand allocations. */
> +       if (ondemand && !ondemand_ready)
> +               return;
> +
> +       old = READ_ONCE(tag->meta.cache);
> +       /* Already allocated? */
> +       if (old)
> +               return;
> +
> +       if (tag->meta.sized < SIZE_MAX) {
> +               /* Fixed-size allocations. */
> +               name = kasprintf(GFP_KERNEL, "f:%zu:%s:%d", tag->meta.sized,
> +                                ct->function, ct->lineno);
> +               if (WARN_ON_ONCE(!name))
> +                       return;
> +               /*
> +                * As with KMALLOC_NORMAL, the entire allocation needs to be
> +                * open to usercopy access. :(
> +                */
> +               p = kmem_cache_create_usercopy(name, tag->meta.sized, 0,
> +                                              SLAB_NO_MERGE, 0, tag->meta.sized,
> +                                              NULL);
> +       } else {
> +               /* Dynamically-sized allocations. */
> +               name = kasprintf(GFP_KERNEL, "d:%s:%d", ct->function, ct->lineno);
> +               if (WARN_ON_ONCE(!name))
> +                       return;
> +               p = kmem_buckets_create(name, SLAB_NO_MERGE, 0, UINT_MAX, NULL);
> +       }
> +       if (p) {
> +               if (unlikely(!try_cmpxchg(&tag->meta.cache, &old, p))) {
> +                       /* We lost the allocation race; clean up. */
> +                       if (tag->meta.sized < SIZE_MAX)
> +                               kmem_cache_destroy(p);
> +                       else
> +                               kmem_buckets_destroy(p);
> +               }
> +       }
> +       kfree(name);
> +}
> +
> +static void alloc_tag_site_init_early(struct codetag *ct)
> +{
> +       /* Explicitly initialize the caches needed to initialize caches. */
> +       if (strcmp(ct->function, "kstrdup") == 0 ||
> +           strcmp(ct->function, "kvasprintf") == 0 ||
> +           strcmp(ct->function, "pcpu_mem_zalloc") == 0)

I hope we can find a better way to distinguish these allocations.
Maybe have a specialized hook for them, like alloc_hooks_early() which
sets a bit inside ct->flags to distinguish them?
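Roughly what I have in mind, reusing the alloc_hooks_tag() machinery
from this patch; CODETAG_FLAG_EARLY is a made-up flag bit, so this is
only a sketch:

        /* Hypothetical flag bit marking sites needed to bootstrap the caches. */
        #define CODETAG_FLAG_EARLY      BIT(0)

        #define alloc_hooks_early(_do_alloc)                            \
        ({                                                              \
                DEFINE_ALLOC_TAG(_alloc_tag);                           \
                _alloc_tag.ct.flags |= CODETAG_FLAG_EARLY;              \
                alloc_hooks_tag(&_alloc_tag, _do_alloc);                \
        })

        static void alloc_tag_site_init_early(struct codetag *ct)
        {
                /* Explicitly initialize the caches needed to initialize caches. */
                if (ct->flags & CODETAG_FLAG_EARLY)
                        alloc_tag_site_init(ct, false);
        }

Setting the bit in the tag's static initializer would be even better,
since the early walk can run before a site has ever been called.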
> +               alloc_tag_site_init(ct, false);
> +
> +       /* TODO: pre-allocate GFP_ATOMIC caches here. */

You could pre-allocate GFP_ATOMIC caches during alloc_tag_module_load()
only if gfp_flags are known at compile time, I think. I guess for the
dynamic case choose_slab() will fall back to kmalloc_slab()?

> +}
> +#endif
> +
> +static void alloc_tag_module_load(struct codetag_type *cttype,
> +                                 struct codetag_module *cmod)
> +{
> +#ifdef CONFIG_SLAB_PER_SITE
> +       struct codetag_iterator iter;
> +       struct codetag *ct;
> +
> +       iter = codetag_get_ct_iter(cttype);
> +       for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
> +               if (iter.cmod != cmod)
> +                       continue;
> +
> +               /* TODO: pre-allocate GFP_ATOMIC caches here. */
> +               //alloc_tag_site_init(ct, false);
> +       }
> +#endif
> +}
> +
>  static bool alloc_tag_module_unload(struct codetag_type *cttype,
>                                     struct codetag_module *cmod)
>  {
> @@ -175,8 +258,21 @@ static bool alloc_tag_module_unload(struct codetag_type *cttype,
>
>                 if (WARN(counter.bytes,
>                          "%s:%u module %s func:%s has %llu allocated at module unload",
> -                        ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
> +                        ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) {
>                         module_unused = false;
> +               }
> +#ifdef CONFIG_SLAB_PER_SITE
> +               else if (tag->meta.sized) {
> +                       /* Remove the allocated caches, if possible. */
> +                       void *p = READ_ONCE(tag->meta.cache);
> +
> +                       WRITE_ONCE(tag->meta.cache, NULL);

I'm guessing you are not using try_cmpxchg() the same way you did in
alloc_tag_site_init() because a race with any other user is impossible
at module unload time? If so, a comment mentioning that would be good.
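For example, something like this above the destroy calls (or the
teardown could use xchg() so it would be safe either way; just a
suggestion):

                        /*
                         * No other users of this tag can exist at module
                         * unload time, so unlike alloc_tag_site_init() we
                         * don't need the try_cmpxchg() dance here.
                         */
                        void *p = xchg(&tag->meta.cache, NULL);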
> +                       if (tag->meta.sized < SIZE_MAX)
> +                               kmem_cache_destroy(p);
> +                       else
> +                               kmem_buckets_destroy(p);
> +               }
> +#endif
>         }
>
>         return module_unused;
> @@ -260,15 +356,16 @@ static void __init sysctl_init(void)
>  static inline void sysctl_init(void) {}
>  #endif /* CONFIG_SYSCTL */
>
> +static const struct codetag_type_desc alloc_tag_desc = {
> +       .section = "alloc_tags",
> +       .tag_size = sizeof(struct alloc_tag),
> +       .module_load = alloc_tag_module_load,
> +       .module_unload = alloc_tag_module_unload,
> +};
> +
>  static int __init alloc_tag_init(void)
>  {
> -       const struct codetag_type_desc desc = {
> -               .section = "alloc_tags",
> -               .tag_size = sizeof(struct alloc_tag),
> -               .module_unload = alloc_tag_module_unload,
> -       };
> -
> -       alloc_tag_cttype = codetag_register_type(&desc);
> +       alloc_tag_cttype = codetag_register_type(&alloc_tag_desc);
>         if (IS_ERR(alloc_tag_cttype))
>                 return PTR_ERR(alloc_tag_cttype);
>
> @@ -278,3 +375,11 @@ static int __init alloc_tag_init(void)
>         return 0;
>  }
>  module_init(alloc_tag_init);
> +
> +#ifdef CONFIG_SLAB_PER_SITE
> +void alloc_tag_early_walk(void)
> +{
> +       codetag_early_walk(&alloc_tag_desc, alloc_tag_site_init_early);
> +       ondemand_ready = true;
> +}
> +#endif
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 855c63c3270d..4f01cb6dd32e 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -302,7 +302,20 @@ config SLAB_PER_SITE
>         default SLAB_FREELIST_HARDENED
>         select SLAB_BUCKETS
>         help
> -         Track sizes of kmalloc() call sites.
> +         As a defense against shared-cache "type confusion" use-after-free
> +         attacks, every kmalloc()-family call allocates from a separate
> +         kmem_cache (or when dynamically sized, kmem_buckets). Attackers
> +         will no longer be able to groom malicious objects via similarly
> +         sized allocations that share the same cache as the target object.
> +
> +         This increases the "at rest" kmalloc slab memory usage by
> +         roughly 5x (around 7MiB), and adds the potential for greater
> +         long-term memory fragmentation. However, some workloads
> +         actually see performance improvements when single allocation
> +         sites are hot.

I hope you provide the performance and overhead data in the cover
letter when you post v1.

> +
> +         For a similar defense, see CONFIG_RANDOM_KMALLOC_CACHES, which
> +         has less memory usage overhead, but is probabilistic.
>
>  config SLUB_STATS
>         default n
> @@ -331,6 +344,7 @@ config SLUB_CPU_PARTIAL
>  config RANDOM_KMALLOC_CACHES
>         default n
>         depends on !SLUB_TINY
> +       depends on !SLAB_PER_SITE
>         bool "Randomize slab caches for normal kmalloc"
>         help
>           A hardening feature that creates multiple copies of slab caches for
> @@ -345,6 +359,9 @@ config RANDOM_KMALLOC_CACHES
>           limited degree of memory and CPU overhead that relates to hardware and
>           system workload.
>
> +         For a similar defense, see CONFIG_SLAB_PER_SITE, which is
> +         deterministic, but has greater memory usage overhead.
> +
>  endmenu # Slab allocator options
>
>  config SHUFFLE_PAGE_ALLOCATOR
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index fc698cba0ebe..09506bfa972c 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1040,6 +1040,7 @@ void __init create_kmalloc_caches(void)
>         kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
>                                                sizeof(kmem_buckets),
>                                                0, SLAB_NO_MERGE, NULL);
> +       alloc_tag_early_walk();
>  }
>
>  /**
> diff --git a/mm/slub.c b/mm/slub.c
> index 3520acaf9afa..d14102c4b4d7 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4135,6 +4135,35 @@ void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
>  }
>  EXPORT_SYMBOL(__kmalloc_large_node_noprof);
>
> +static __always_inline
> +struct kmem_cache *choose_slab(size_t size, kmem_buckets *b, gfp_t flags,
> +                              unsigned long caller)
> +{
> +#ifdef CONFIG_SLAB_PER_SITE
> +       struct alloc_tag *tag = current->alloc_tag;
> +
> +       if (!b && tag && tag->meta.sized &&
> +           kmalloc_type(flags, caller) == KMALLOC_NORMAL &&
> +           (flags & GFP_ATOMIC) != GFP_ATOMIC) {

What if the allocation is GFP_ATOMIC, but a previous allocation from
the same location (same tag) happened without GFP_ATOMIC and
tag->meta.cache was already allocated? Why not use that existing
cache? The same applies if tag->meta.cache was pre-allocated.

> +               void *p = READ_ONCE(tag->meta.cache);
> +
> +               if (!p && slab_state >= UP) {
> +                       alloc_tag_site_init(&tag->ct, true);
> +                       p = READ_ONCE(tag->meta.cache);
> +               }
> +
> +               if (tag->meta.sized < SIZE_MAX) {
> +                       if (p)
> +                               return p;
> +                       /* Otherwise continue with default buckets. */
> +               } else {
> +                       b = p;
> +               }
> +       }
> +#endif
> +       return kmalloc_slab(size, b, flags, caller);
> +}
> +
>  static __always_inline
>  void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
>                         unsigned long caller)
> @@ -4152,7 +4181,7 @@ void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
>         if (unlikely(!size))
>                 return ZERO_SIZE_PTR;
>
> -       s = kmalloc_slab(size, b, flags, caller);
> +       s = choose_slab(size, b, flags, caller);
>
>         ret = slab_alloc_node(s, NULL, flags, node, caller, size);
>         ret = kasan_kmalloc(s, ret, size, flags);
> --
> 2.34.1
>
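To illustrate the choose_slab() question above: reuse an existing
per-site cache regardless of the gfp flags, and only gate the
on-demand cache creation on not being GFP_ATOMIC. An untested sketch
based on your version:

static __always_inline
struct kmem_cache *choose_slab(size_t size, kmem_buckets *b, gfp_t flags,
                               unsigned long caller)
{
#ifdef CONFIG_SLAB_PER_SITE
        struct alloc_tag *tag = current->alloc_tag;

        if (!b && tag && tag->meta.sized &&
            kmalloc_type(flags, caller) == KMALLOC_NORMAL) {
                void *p = READ_ONCE(tag->meta.cache);

                /*
                 * An already-created (or pre-allocated) cache is usable
                 * from any context; only creating one on demand has to
                 * be avoided for atomic allocations.
                 */
                if (!p && slab_state >= UP &&
                    (flags & GFP_ATOMIC) != GFP_ATOMIC) {
                        alloc_tag_site_init(&tag->ct, true);
                        p = READ_ONCE(tag->meta.cache);
                }

                if (tag->meta.sized < SIZE_MAX) {
                        if (p)
                                return p;
                        /* Otherwise continue with default buckets. */
                } else if (p) {
                        b = p;
                }
        }
#endif
        return kmalloc_slab(size, b, flags, caller);
}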