On Mon, Apr 27, 2020 at 01:12:42PM -0700, Yonghong Song wrote: > A new bpf command BPF_ITER_CREATE is added. > > The anonymous bpf iterator is seq_file based. > The seq_file private data are referenced by targets. > The bpf_iter infrastructure allocated additional space > at seq_file->private after the space used by targets > to store some meta data, e.g., > prog: prog to run > session_id: an unique id for each opened seq_file > seq_num: how many times bpf programs are queried in this session > has_last: indicate whether or not bpf_prog has been called after > all valid objects have been processed > > A map between file and prog/link is established to help > fops->release(). When fops->release() is called, just based on > inode and file, bpf program cannot be located since target > seq_priv_size not available. This map helps retrieve the prog > whose reference count needs to be decremented. How about putting "struct extra_priv_data" at the beginning of the seq_file's private store instead since its size is known. seq->private can point to an aligned byte after +sizeof(struct extra_priv_data). [ ... ] > diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c > index fc1ce5ee5c3f..1f4e778d1814 100644 > --- a/kernel/bpf/bpf_iter.c > +++ b/kernel/bpf/bpf_iter.c [ ... ] > @@ -26,6 +40,50 @@ static bool bpf_iter_inited = false; > /* protect bpf_iter_link.link->prog upddate */ > static struct mutex bpf_iter_mutex; > > +/* Since at anon seq_file release function, the prog cannot > + * be retrieved since target seq_priv_size is not available. > + * Keep a list of <anon_file, prog> mapping, so that > + * at file release stage, the prog can be released properly. > + */ > +static struct list_head anon_iter_info; > +static struct mutex anon_iter_info_mutex; > + > +/* incremented on every opened seq_file */ > +static atomic64_t session_id; > + > +static u32 get_total_priv_dsize(u32 old_size) > +{ > + return roundup(old_size, 8) + sizeof(struct extra_priv_data); > +} > + > +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) > +{ > + return old_ptr + roundup(old_size, 8); > +} > + > +static int anon_iter_release(struct inode *inode, struct file *file) > +{ > + struct anon_file_prog_assoc *finfo; > + > + mutex_lock(&anon_iter_info_mutex); > + list_for_each_entry(finfo, &anon_iter_info, list) { > + if (finfo->file == file) { > + bpf_prog_put(finfo->prog); > + list_del(&finfo->list); > + kfree(finfo); > + break; > + } > + } > + mutex_unlock(&anon_iter_info_mutex); > + > + return seq_release_private(inode, file); > +} > + > +static const struct file_operations anon_bpf_iter_fops = { > + .read = seq_read, > + .release = anon_iter_release, > +}; > + > int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) > { > struct bpf_iter_target_info *tinfo; [ ... ] > @@ -150,3 +223,90 @@ int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *old_prog, > mutex_unlock(&bpf_iter_mutex); > return ret; > } > + > +static void init_seq_file(void *priv_data, struct bpf_iter_target_info *tinfo, > + struct bpf_prog *prog) > +{ > + struct extra_priv_data *extra_data; > + > + if (tinfo->target_feature & BPF_DUMP_SEQ_NET_PRIVATE) > + set_seq_net_private((struct seq_net_private *)priv_data, > + current->nsproxy->net_ns); > + > + extra_data = get_extra_priv_dptr(priv_data, tinfo->seq_priv_size); > + extra_data->session_id = atomic64_add_return(1, &session_id); > + extra_data->prog = prog; > + extra_data->seq_num = 0; > + extra_data->has_last = false; > +} > + > +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) > +{ > + struct anon_file_prog_assoc *finfo; > + struct bpf_iter_target_info *tinfo; > + struct bpf_prog *prog; > + u32 total_priv_dsize; > + void *priv_data; > + > + finfo = kmalloc(sizeof(*finfo), GFP_USER | __GFP_NOWARN); > + if (!finfo) > + return -ENOMEM; > + > + mutex_lock(&bpf_iter_mutex); > + prog = link->link.prog; > + bpf_prog_inc(prog); > + mutex_unlock(&bpf_iter_mutex); > + > + tinfo = link->tinfo; > + total_priv_dsize = get_total_priv_dsize(tinfo->seq_priv_size); > + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); > + if (!priv_data) { > + bpf_prog_sub(prog, 1); Could prog's refcnt go 0 here? If yes, bpf_prog_put() should be used. > + kfree(finfo); > + return -ENOMEM; > + } > + > + init_seq_file(priv_data, tinfo, prog); > + > + finfo->file = file; > + finfo->prog = prog; > + > + mutex_lock(&anon_iter_info_mutex); > + list_add(&finfo->list, &anon_iter_info); > + mutex_unlock(&anon_iter_info_mutex); > + return 0; > +} > +