On Mon, Apr 27, 2020 at 1:19 PM Yonghong Song <yhs@xxxxxx> wrote: > > A new bpf command BPF_ITER_CREATE is added. > > The anonymous bpf iterator is seq_file based. > The seq_file private data are referenced by targets. > The bpf_iter infrastructure allocated additional space > at seq_file->private after the space used by targets > to store some meta data, e.g., > prog: prog to run > session_id: an unique id for each opened seq_file > seq_num: how many times bpf programs are queried in this session > has_last: indicate whether or not bpf_prog has been called after > all valid objects have been processed > > A map between file and prog/link is established to help > fops->release(). When fops->release() is called, just based on > inode and file, bpf program cannot be located since target > seq_priv_size not available. This map helps retrieve the prog > whose reference count needs to be decremented. > > Signed-off-by: Yonghong Song <yhs@xxxxxx> > --- > include/linux/bpf.h | 3 + > include/uapi/linux/bpf.h | 6 ++ > kernel/bpf/bpf_iter.c | 162 ++++++++++++++++++++++++++++++++- > kernel/bpf/syscall.c | 27 ++++++ > tools/include/uapi/linux/bpf.h | 6 ++ > 5 files changed, 203 insertions(+), 1 deletion(-) > [...] > int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) > { > struct bpf_iter_target_info *tinfo; > @@ -37,6 +95,8 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) > INIT_LIST_HEAD(&targets); > mutex_init(&targets_mutex); > mutex_init(&bpf_iter_mutex); > + INIT_LIST_HEAD(&anon_iter_info); > + mutex_init(&anon_iter_info_mutex); > bpf_iter_inited = true; > } > > @@ -61,7 +121,20 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) > struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size, > u64 *session_id, u64 *seq_num, bool is_last) instead of passing many pointers (session_id, seq_num), would it be better to just pass bpf_iter_meta * instead? > { > - return NULL; > + struct extra_priv_data *extra_data; > + > + if (seq->file->f_op != &anon_bpf_iter_fops) > + return NULL; > + > + extra_data = get_extra_priv_dptr(seq->private, priv_data_size); > + if (extra_data->has_last) > + return NULL; > + > + *session_id = extra_data->session_id; > + *seq_num = extra_data->seq_num++; > + extra_data->has_last = is_last; > + > + return extra_data->prog; > } > > int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) > @@ -150,3 +223,90 @@ int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, > mutex_unlock(&bpf_iter_mutex); > return ret; > } > + > +static void init_seq_file(void *priv_data, struct bpf_iter_target_info *tinfo, > + struct bpf_prog *prog) > +{ > + struct extra_priv_data *extra_data; > + > + if (tinfo->target_feature & BPF_DUMP_SEQ_NET_PRIVATE) > + set_seq_net_private((struct seq_net_private *)priv_data, > + current->nsproxy->net_ns); > + > + extra_data = get_extra_priv_dptr(priv_data, tinfo->seq_priv_size); > + extra_data->session_id = atomic64_add_return(1, &session_id); > + extra_data->prog = prog; > + extra_data->seq_num = 0; > + extra_data->has_last = false; > +} > + > +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) > +{ > + struct anon_file_prog_assoc *finfo; > + struct bpf_iter_target_info *tinfo; > + struct bpf_prog *prog; > + u32 total_priv_dsize; > + void *priv_data; > + > + finfo = kmalloc(sizeof(*finfo), GFP_USER | __GFP_NOWARN); > + if (!finfo) > + return -ENOMEM; > + > + mutex_lock(&bpf_iter_mutex); > + prog = link->link.prog; > + bpf_prog_inc(prog); > + mutex_unlock(&bpf_iter_mutex); > + > + tinfo = link->tinfo; > + total_priv_dsize = get_total_priv_dsize(tinfo->seq_priv_size); > + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); > + if (!priv_data) { > + bpf_prog_sub(prog, 1); > + kfree(finfo); > + return -ENOMEM; > + } > + > + init_seq_file(priv_data, tinfo, prog); > + > + finfo->file = file; > + finfo->prog = prog; > + > + mutex_lock(&anon_iter_info_mutex); > + list_add(&finfo->list, &anon_iter_info); > + mutex_unlock(&anon_iter_info_mutex); > + return 0; > +} > + > +int bpf_iter_new_fd(struct bpf_link *link) > +{ > + struct file *file; > + int err, fd; > + > + if (link->ops != &bpf_iter_link_lops) > + return -EINVAL; > + > + fd = get_unused_fd_flags(O_CLOEXEC); > + if (fd < 0) > + return fd; > + > + file = anon_inode_getfile("bpf_iter", &anon_bpf_iter_fops, > + NULL, O_CLOEXEC); Shouldn't this anon file be readable and have O_RDONLY flag as well? > + if (IS_ERR(file)) { > + err = PTR_ERR(file); > + goto free_fd; > + } > + > + err = prepare_seq_file(file, > + container_of(link, struct bpf_iter_link, link)); > + if (err) > + goto free_file; > + > + fd_install(fd, file); > + return fd; > + > +free_file: > + fput(file); > +free_fd: > + put_unused_fd(fd); > + return err; > +} > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index b7af4f006f2e..458f7000887a 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -3696,6 +3696,30 @@ static int link_update(union bpf_attr *attr) > return ret; > } > > +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags > + > +static int bpf_iter_create(union bpf_attr *attr) > +{ > + struct bpf_link *link; > + int err; > + > + if (CHECK_ATTR(BPF_ITER_CREATE)) > + return -EINVAL; > + > + if (attr->iter_create.flags) > + return -EINVAL; > + > + link = bpf_link_get_from_fd(attr->iter_create.link_fd); > + if (IS_ERR(link)) > + return PTR_ERR(link); > + > + err = bpf_iter_new_fd(link); > + if (err < 0) > + bpf_link_put(link); bpf_iter_new_fd() doesn't take a refcnt on link, so you need to put it regardless of success or error > + > + return err; > +} > + > SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) > { > union bpf_attr attr; > @@ -3813,6 +3837,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz > case BPF_LINK_UPDATE: > err = link_update(&attr); > break; > + case BPF_ITER_CREATE: > + err = bpf_iter_create(&attr); > + break; > default: > err = -EINVAL; > break; [...]