On 10/2/24 08:54, Namhyung Kim wrote:
> The new "kmem_cache" iterator will traverse the list of slab caches
> and call attached BPF programs for each entry. It should check the
> argument (ctx.s) if it's NULL before using it.
> 
> Now the iteration grabs the slab_mutex only when it traverses the list
> and releases the mutex when it runs the BPF program. The kmem_cache
> entry is protected by a refcount during the execution.
> 
> It includes the internal "mm/slab.h" header to access kmem_cache,
> slab_caches and slab_mutex. Hope it's ok with mm folks.
> 
> Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
> ---
> I've removed the Acked-by's from Roman and Vlastimil since it's changed
> not to hold the slab_mutex and to manage the refcount. Please review
> this change again!
> 
>  include/linux/btf_ids.h      |   1 +
>  kernel/bpf/Makefile          |   1 +
>  kernel/bpf/kmem_cache_iter.c | 165 +++++++++++++++++++++++++++++++++++
>  3 files changed, 167 insertions(+)
>  create mode 100644 kernel/bpf/kmem_cache_iter.c
> 
> diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
> index c0e3e1426a82f5c4..139bdececdcfaefb 100644
> --- a/include/linux/btf_ids.h
> +++ b/include/linux/btf_ids.h
> @@ -283,5 +283,6 @@ extern u32 btf_tracing_ids[];
>  extern u32 bpf_cgroup_btf_id[];
>  extern u32 bpf_local_storage_map_btf_id[];
>  extern u32 btf_bpf_map_id[];
> +extern u32 bpf_kmem_cache_btf_id[];
>  
>  #endif
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 9b9c151b5c826b31..105328f0b9c04e37 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -52,3 +52,4 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/
>  obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
>  obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
> +obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
> diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c
> new file mode 100644
> index 0000000000000000..a77c08b82c6bc965
> --- /dev/null
> +++ b/kernel/bpf/kmem_cache_iter.c
> @@ -0,0 +1,165 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2024 Google */
> +#include <linux/bpf.h>
> +#include <linux/btf_ids.h>
> +#include <linux/slab.h>
> +#include <linux/kernel.h>
> +#include <linux/seq_file.h>
> +
> +#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
> +
> +struct bpf_iter__kmem_cache {
> +	__bpf_md_ptr(struct bpf_iter_meta *, meta);
> +	__bpf_md_ptr(struct kmem_cache *, s);
> +};
> +
> +static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	loff_t cnt = 0;
> +	struct kmem_cache *s = NULL;
> +
> +	mutex_lock(&slab_mutex);
> +
> +	/*
> +	 * Find an entry at the given position in the slab_caches list instead
> +	 * of keeping a reference (of the last visited entry, if any) out of
> +	 * slab_mutex. It might miss something if one is deleted in the middle
> +	 * while it releases the lock. But it should be rare and there's not
> +	 * much we can do about it.
> +	 */
> +	list_for_each_entry(s, &slab_caches, list) {
> +		if (cnt == *pos) {
> +			/*
> +			 * Make sure this entry remains in the list by getting
> +			 * a new reference count. Note that boot_cache entries
> +			 * have a negative refcount, so don't touch them.
> +			 */
> +			if (s->refcount > 0)
> +				s->refcount++;
> +			break;
> +		}
> +
> +		cnt++;
> +	}
> +	mutex_unlock(&slab_mutex);
> +
> +	if (cnt != *pos)
> +		return NULL;
> +
> +	++*pos;
> +	return s;
> +}
> +
> +static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
> +{
> +	struct bpf_iter_meta meta;
> +	struct bpf_iter__kmem_cache ctx = {
> +		.meta = &meta,
> +		.s = v,
> +	};
> +	struct bpf_prog *prog;
> +	bool destroy = false;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, true);
> +	if (prog)
> +		bpf_iter_run_prog(prog, &ctx);
> +
> +	mutex_lock(&slab_mutex);
> +	if (ctx.s && ctx.s->refcount > 0)
> +		destroy = true;

I'd do the same optimization as in kmem_cache_iter_seq_next(), otherwise
this will always result in taking the mutex twice and performing
kvfree_rcu_barrier() needlessly? Something like the (untested) sketch
below the quoted patch.

> +	mutex_unlock(&slab_mutex);
> +
> +	if (destroy)
> +		kmem_cache_destroy(ctx.s);
> +}
> +
> +static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct kmem_cache *s = v;
> +	struct kmem_cache *next = NULL;
> +	bool destroy = false;
> +
> +	++*pos;
> +
> +	mutex_lock(&slab_mutex);
> +
> +	if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) {
> +		next = list_next_entry(s, list);
> +		if (next->refcount > 0)
> +			next->refcount++;
> +	}
> +
> +	/* Skip kmem_cache_destroy() for active entries */
> +	if (s->refcount > 1)
> +		s->refcount--;
> +	else if (s->refcount == 1)
> +		destroy = true;
> +
> +	mutex_unlock(&slab_mutex);
> +
> +	if (destroy)
> +		kmem_cache_destroy(s);
> +
> +	return next;
> +}
> +
> +static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
> +{
> +	struct bpf_iter_meta meta;
> +	struct bpf_iter__kmem_cache ctx = {
> +		.meta = &meta,
> +		.s = v,
> +	};
> +	struct bpf_prog *prog;
> +	int ret = 0;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, false);
> +	if (prog)
> +		ret = bpf_iter_run_prog(prog, &ctx);
> +
> +	return ret;
> +}
> +
> +static const struct seq_operations kmem_cache_iter_seq_ops = {
> +	.start = kmem_cache_iter_seq_start,
> +	.next  = kmem_cache_iter_seq_next,
> +	.stop  = kmem_cache_iter_seq_stop,
> +	.show  = kmem_cache_iter_seq_show,
> +};
> +
> +BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
> +
> +static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
> +	.seq_ops = &kmem_cache_iter_seq_ops,
> +};
> +
> +static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
> +					    struct seq_file *seq)
> +{
> +	seq_puts(seq, "kmem_cache iter\n");
> +}
> +
> +DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
> +		     struct kmem_cache *s)
> +
> +static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
> +	.target = "kmem_cache",
> +	.feature = BPF_ITER_RESCHED,
> +	.show_fdinfo = bpf_iter_kmem_cache_show_fdinfo,
> +	.ctx_arg_info_size = 1,
> +	.ctx_arg_info = {
> +		{ offsetof(struct bpf_iter__kmem_cache, s),
> +		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
> +	},
> +	.seq_info = &kmem_cache_iter_seq_info,
> +};
> +
> +static int __init bpf_kmem_cache_iter_init(void)
> +{
> +	bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
> +	return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
> +}
> +
> +late_initcall(bpf_kmem_cache_iter_init);
> 
> base-commit: 9502a7de5a61bec3bda841a830560c5d6d40ecac
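Roughly what I mean for the ->stop() callback, mirroring the refcount
handling that kmem_cache_iter_seq_next() already does under the mutex we
hold anyway. Untested sketch only, not a tested patch:

static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_iter__kmem_cache ctx = {
		.meta = &meta,
		.s = v,
	};
	struct bpf_prog *prog;
	bool destroy = false;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, true);
	if (prog)
		bpf_iter_run_prog(prog, &ctx);

	mutex_lock(&slab_mutex);
	/*
	 * Drop the reference taken in ->start()/->next() while we already
	 * hold slab_mutex, and only call kmem_cache_destroy() when that was
	 * the last reference. Boot caches have a negative refcount and are
	 * left alone. This avoids taking slab_mutex a second time and
	 * running kvfree_rcu_barrier() for caches that stay alive anyway.
	 */
	if (ctx.s) {
		if (ctx.s->refcount > 1)
			ctx.s->refcount--;
		else if (ctx.s->refcount == 1)
			destroy = true;
	}
	mutex_unlock(&slab_mutex);

	if (destroy)
		kmem_cache_destroy(ctx.s);
}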
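BTW, for anyone who wants to try this out while reviewing the ctx.s NULL
check mentioned in the changelog: a minimal program attaching to the
iterator would presumably look something like the below. Untested sketch;
the section name follows the usual libbpf "iter/<target>" convention and
the kmem_cache fields printed are just an example.

// SPDX-License-Identifier: GPL-2.0
/* Untested sketch: print the name and object size of each slab cache. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/kmem_cache")
int dump_kmem_cache(struct bpf_iter__kmem_cache *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct kmem_cache *s = ctx->s;

	/* ctx->s can be NULL (e.g. in the ->stop() call), check before use */
	if (!s)
		return 0;

	BPF_SEQ_PRINTF(seq, "%s %u\n", s->name, s->size);
	return 0;
}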