On 10/2/24 08:54, Namhyung Kim wrote:
> The new "kmem_cache" iterator will traverse the list of slab caches
> and call attached BPF programs for each entry. It should check the
> argument (ctx.s) if it's NULL before using it.
> 
> Now the iteration grabs the slab_mutex only when it traverses the list
> and releases the mutex when it runs the BPF program. The kmem_cache
> entry is protected by a refcount during the execution.
> 
> It includes the internal "mm/slab.h" header to access kmem_cache,
> slab_caches and slab_mutex. Hope it's ok with mm folks.
> 
> Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
> ---
> I've removed the Acked-by's from Roman and Vlastimil since it's changed
> not to hold the slab_mutex and to manage the refcount. Please review
> this change again!
> 
>  include/linux/btf_ids.h      |   1 +
>  kernel/bpf/Makefile          |   1 +
>  kernel/bpf/kmem_cache_iter.c | 165 +++++++++++++++++++++++++++++++++++
>  3 files changed, 167 insertions(+)
>  create mode 100644 kernel/bpf/kmem_cache_iter.c
> 
> diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
> index c0e3e1426a82f5c4..139bdececdcfaefb 100644
> --- a/include/linux/btf_ids.h
> +++ b/include/linux/btf_ids.h
> @@ -283,5 +283,6 @@ extern u32 btf_tracing_ids[];
>  extern u32 bpf_cgroup_btf_id[];
>  extern u32 bpf_local_storage_map_btf_id[];
>  extern u32 btf_bpf_map_id[];
> +extern u32 bpf_kmem_cache_btf_id[];
>  
>  #endif
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 9b9c151b5c826b31..105328f0b9c04e37 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -52,3 +52,4 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/
>  obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
>  obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
> +obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
> diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c
> new file mode 100644
> index 0000000000000000..a77c08b82c6bc965
> --- /dev/null
> +++ b/kernel/bpf/kmem_cache_iter.c
> @@ -0,0 +1,165 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2024 Google */
> +#include <linux/bpf.h>
> +#include <linux/btf_ids.h>
> +#include <linux/slab.h>
> +#include <linux/kernel.h>
> +#include <linux/seq_file.h>
> +
> +#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
> +
> +struct bpf_iter__kmem_cache {
> +	__bpf_md_ptr(struct bpf_iter_meta *, meta);
> +	__bpf_md_ptr(struct kmem_cache *, s);
> +};
> +
> +static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	loff_t cnt = 0;
> +	struct kmem_cache *s = NULL;
> +
> +	mutex_lock(&slab_mutex);
> +
> +	/*
> +	 * Find an entry at the given position in the slab_caches list instead
> +	 * of keeping a reference (of the last visited entry, if any) out of
> +	 * slab_mutex. It might miss something if one is deleted in the middle
> +	 * while it releases the lock. But it should be rare and there's not
> +	 * much we can do about it.
> +	 */
> +	list_for_each_entry(s, &slab_caches, list) {
> +		if (cnt == *pos) {
> +			/*
> +			 * Make sure this entry remains in the list by getting
> +			 * a new reference count. Note that boot_cache entries
> +			 * have a negative refcount, so don't touch them.
> +			 */
> +			if (s->refcount > 0)
> +				s->refcount++;
> +			break;
> +		}
> +
> +		cnt++;
> +	}
> +	mutex_unlock(&slab_mutex);
> +
> +	if (cnt != *pos)
> +		return NULL;
> +
> +	++*pos;
> +	return s;
> +}
> +
> +static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
> +{
> +	struct bpf_iter_meta meta;
> +	struct bpf_iter__kmem_cache ctx = {
> +		.meta = &meta,
> +		.s = v,
> +	};
> +	struct bpf_prog *prog;
> +	bool destroy = false;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, true);
> +	if (prog)
> +		bpf_iter_run_prog(prog, &ctx);
> +
> +	mutex_lock(&slab_mutex);
> +	if (ctx.s && ctx.s->refcount > 0)
> +		destroy = true;

I'd do the same optimization as in kmem_cache_iter_seq_next(), otherwise
this will always result in taking the mutex twice and performing
kvfree_rcu_barrier() needlessly? Something like the (untested) sketch
below the quoted patch.

> +	mutex_unlock(&slab_mutex);
> +
> +	if (destroy)
> +		kmem_cache_destroy(ctx.s);
> +}
> +
> +static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct kmem_cache *s = v;
> +	struct kmem_cache *next = NULL;
> +	bool destroy = false;
> +
> +	++*pos;
> +
> +	mutex_lock(&slab_mutex);
> +
> +	if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) {
> +		next = list_next_entry(s, list);
> +		if (next->refcount > 0)
> +			next->refcount++;
> +	}
> +
> +	/* Skip kmem_cache_destroy() for active entries */
> +	if (s->refcount > 1)
> +		s->refcount--;
> +	else if (s->refcount == 1)
> +		destroy = true;
> +
> +	mutex_unlock(&slab_mutex);
> +
> +	if (destroy)
> +		kmem_cache_destroy(s);
> +
> +	return next;
> +}
> +
> +static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
> +{
> +	struct bpf_iter_meta meta;
> +	struct bpf_iter__kmem_cache ctx = {
> +		.meta = &meta,
> +		.s = v,
> +	};
> +	struct bpf_prog *prog;
> +	int ret = 0;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, false);
> +	if (prog)
> +		ret = bpf_iter_run_prog(prog, &ctx);
> +
> +	return ret;
> +}
> +
> +static const struct seq_operations kmem_cache_iter_seq_ops = {
> +	.start = kmem_cache_iter_seq_start,
> +	.next  = kmem_cache_iter_seq_next,
> +	.stop  = kmem_cache_iter_seq_stop,
> +	.show  = kmem_cache_iter_seq_show,
> +};
> +
> +BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
> +
> +static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
> +	.seq_ops = &kmem_cache_iter_seq_ops,
> +};
> +
> +static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
> +					    struct seq_file *seq)
> +{
> +	seq_puts(seq, "kmem_cache iter\n");
> +}
> +
> +DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
> +		     struct kmem_cache *s)
> +
> +static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
> +	.target = "kmem_cache",
> +	.feature = BPF_ITER_RESCHED,
> +	.show_fdinfo = bpf_iter_kmem_cache_show_fdinfo,
> +	.ctx_arg_info_size = 1,
> +	.ctx_arg_info = {
> +		{ offsetof(struct bpf_iter__kmem_cache, s),
> +		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
> +	},
> +	.seq_info = &kmem_cache_iter_seq_info,
> +};
> +
> +static int __init bpf_kmem_cache_iter_init(void)
> +{
> +	bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
> +	return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
> +}
> +
> +late_initcall(bpf_kmem_cache_iter_init);
> 
> base-commit: 9502a7de5a61bec3bda841a830560c5d6d40ecac
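Roughly what I mean for the ->stop() callback, mirroring the refcount
handling that kmem_cache_iter_seq_next() already does under the mutex we
hold anyway. Untested sketch only, not a tested patch:

static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_iter__kmem_cache ctx = {
		.meta = &meta,
		.s = v,
	};
	struct bpf_prog *prog;
	bool destroy = false;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, true);
	if (prog)
		bpf_iter_run_prog(prog, &ctx);

	mutex_lock(&slab_mutex);
	/*
	 * Drop the reference taken in ->start()/->next() while we already
	 * hold slab_mutex, and only call kmem_cache_destroy() when that was
	 * the last reference. Boot caches have a negative refcount and are
	 * left alone. This avoids taking slab_mutex a second time and
	 * running kvfree_rcu_barrier() for caches that stay alive anyway.
	 */
	if (ctx.s) {
		if (ctx.s->refcount > 1)
			ctx.s->refcount--;
		else if (ctx.s->refcount == 1)
			destroy = true;
	}
	mutex_unlock(&slab_mutex);

	if (destroy)
		kmem_cache_destroy(ctx.s);
}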
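BTW, for anyone who wants to try this out while reviewing the ctx.s NULL
check mentioned in the changelog: a minimal program attaching to the
iterator would presumably look something like the below. Untested sketch;
the section name follows the usual libbpf "iter/<target>" convention and
the kmem_cache fields printed are just an example.

// SPDX-License-Identifier: GPL-2.0
/* Untested sketch: print the name and object size of each slab cache. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/kmem_cache")
int dump_kmem_cache(struct bpf_iter__kmem_cache *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct kmem_cache *s = ctx->s;

	/* ctx->s can be NULL (e.g. in the ->stop() call), check before use */
	if (!s)
		return 0;

	BPF_SEQ_PRINTF(seq, "%s %u\n", s->name, s->size);
	return 0;
}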