On Mon, Apr 27, 2020 at 01:12:37PM -0700, Yonghong Song wrote: > The bpf_map iterator is implemented. > The bpf program is called at seq_ops show() and stop() functions. > bpf_iter_get_prog() will retrieve bpf program and other > parameters during seq_file object traversal. In show() function, > bpf program will traverse every valid object, and in stop() > function, bpf program will be called one more time after all > objects are traversed. > > The first member of the bpf context contains the meta data, namely, > the seq_file, session_id and seq_num. Here, the session_id is > a unique id for one specific seq_file session. The seq_num is > the number of bpf prog invocations in the current session. > The bpf_iter_get_prog(), which will be implemented in subsequent > patches, will have more information on how meta data are computed. > > The second member of the bpf context is a struct bpf_map pointer, > which bpf program can examine. > > The target implementation also provided the structure definition > for bpf program and the function definition for verifier to > verify the bpf program. Specifically for bpf_map iterator, > the structure is "bpf_iter__bpf_map" and the function is > "__bpf_iter__bpf_map". 
> > More targets will be implemented later, all of which will include > the following, similar to bpf_map iterator: > - seq_ops() implementation > - function definition for verifier to verify the bpf program > - seq_file private data size > - additional target feature > > Signed-off-by: Yonghong Song <yhs@xxxxxx> > --- > include/linux/bpf.h | 10 ++++ > kernel/bpf/Makefile | 2 +- > kernel/bpf/bpf_iter.c | 19 ++++++++ > kernel/bpf/map_iter.c | 107 ++++++++++++++++++++++++++++++++++++++++++ > kernel/bpf/syscall.c | 13 +++++ > 5 files changed, 150 insertions(+), 1 deletion(-) > create mode 100644 kernel/bpf/map_iter.c > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 5e56abc1e2f1..4ac8d61f7c3e 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1078,6 +1078,7 @@ int generic_map_update_batch(struct bpf_map *map, > int generic_map_delete_batch(struct bpf_map *map, > const union bpf_attr *attr, > union bpf_attr __user *uattr); > +struct bpf_map *bpf_map_get_curr_or_next(u32 *id); > > extern int sysctl_unprivileged_bpf_disabled; > > @@ -1118,7 +1119,16 @@ struct bpf_iter_reg { > u32 target_feature; > }; > > +struct bpf_iter_meta { > + __bpf_md_ptr(struct seq_file *, seq); > + u64 session_id; > + u64 seq_num; > +}; > + > int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); > +struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size, > + u64 *session_id, u64 *seq_num, bool is_last); > +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); > > int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); > int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile > index 6a8b0febd3f6..b2b5eefc5254 100644 > --- a/kernel/bpf/Makefile > +++ b/kernel/bpf/Makefile > @@ -2,7 +2,7 @@ > obj-y := core.o > CFLAGS_core.o += $(call cc-disable-warning, override-init) > > -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o 
bpf_iter.o > +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o > obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o > obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o > obj-$(CONFIG_BPF_SYSCALL) += disasm.o > diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c > index 1115b978607a..284c95587803 100644 > --- a/kernel/bpf/bpf_iter.c > +++ b/kernel/bpf/bpf_iter.c > @@ -48,3 +48,22 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) > > return 0; > } > + > +struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size, > + u64 *session_id, u64 *seq_num, bool is_last) > +{ > + return NULL; Can this patch be moved after this function is implemented? > +} > + > +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) > +{ > + int ret; > + > + migrate_disable(); > + rcu_read_lock(); > + ret = BPF_PROG_RUN(prog, ctx); > + rcu_read_unlock(); > + migrate_enable(); > + > + return ret; > +} > diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c > new file mode 100644 > index 000000000000..bb3ad4c3bde5 > --- /dev/null > +++ b/kernel/bpf/map_iter.c > @@ -0,0 +1,107 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* Copyright (c) 2020 Facebook */ > +#include <linux/bpf.h> > +#include <linux/fs.h> > +#include <linux/filter.h> > +#include <linux/kernel.h> > + > +struct bpf_iter_seq_map_info { > + struct bpf_map *map; > + u32 id; > +}; > + > +static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) > +{ > + struct bpf_iter_seq_map_info *info = seq->private; > + struct bpf_map *map; > + u32 id = info->id; > + > + map = bpf_map_get_curr_or_next(&id); > + if (IS_ERR_OR_NULL(map)) > + return NULL; > + > + ++*pos; Does pos always need to be incremented here? 
> + info->map = map; > + info->id = id; > + return map; > +} > + > +static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) > +{ > + struct bpf_iter_seq_map_info *info = seq->private; > + struct bpf_map *map; > + > + ++*pos; > + ++info->id; > + map = bpf_map_get_curr_or_next(&info->id); > + if (IS_ERR_OR_NULL(map)) > + return NULL; > + > + bpf_map_put(info->map); > + info->map = map; > + return map; > +} > + > +struct bpf_iter__bpf_map { > + __bpf_md_ptr(struct bpf_iter_meta *, meta); > + __bpf_md_ptr(struct bpf_map *, map); > +}; > + > +int __init __bpf_iter__bpf_map(struct bpf_iter_meta *meta, struct bpf_map *map) > +{ > + return 0; > +} > + > +static int bpf_map_seq_show(struct seq_file *seq, void *v) > +{ > + struct bpf_iter_meta meta; > + struct bpf_iter__bpf_map ctx; > + struct bpf_prog *prog; > + int ret = 0; > + > + ctx.meta = &meta; > + ctx.map = v; > + meta.seq = seq; > + prog = bpf_iter_get_prog(seq, sizeof(struct bpf_iter_seq_map_info), > + &meta.session_id, &meta.seq_num, > + v == (void *)0); From looking at seq_file.c, when will show() be called with "v == NULL"? > + if (prog) > + ret = bpf_iter_run_prog(prog, &ctx); > + > + return ret == 0 ? 0 : -EINVAL; The verifier change in patch 4 should have ensured that prog can only return 0? 
> +} > + > +static void bpf_map_seq_stop(struct seq_file *seq, void *v) > +{ > + struct bpf_iter_seq_map_info *info = seq->private; > + > + if (!v) > + bpf_map_seq_show(seq, v); > + > + if (info->map) { > + bpf_map_put(info->map); > + info->map = NULL; > + } > +} > + > +static const struct seq_operations bpf_map_seq_ops = { > + .start = bpf_map_seq_start, > + .next = bpf_map_seq_next, > + .stop = bpf_map_seq_stop, > + .show = bpf_map_seq_show, > +}; > + > +static int __init bpf_map_iter_init(void) > +{ > + struct bpf_iter_reg reg_info = { > + .target = "bpf_map", > + .target_func_name = "__bpf_iter__bpf_map", > + .seq_ops = &bpf_map_seq_ops, > + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), > + .target_feature = 0, > + }; > + > + return bpf_iter_reg_target(®_info); > +} > + > +late_initcall(bpf_map_iter_init); > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 7626b8024471..022187640943 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -2800,6 +2800,19 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, > return err; > } > > +struct bpf_map *bpf_map_get_curr_or_next(u32 *id) > +{ > + struct bpf_map *map; > + > + spin_lock_bh(&map_idr_lock); > + map = idr_get_next(&map_idr, id); > + if (map) > + map = __bpf_map_inc_not_zero(map, false); nit. For the !map case, set "map = ERR_PTR(-ENOENT)" so that the _OR_NULL() test is not needed. It will be more consistent with other error checking codes in syscall.c. > + spin_unlock_bh(&map_idr_lock); > + > + return map; > +} > + > #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id > > struct bpf_prog *bpf_prog_by_id(u32 id) > -- > 2.24.1 >