On Mon, 31 Aug 2020 at 11:04, Jakub Sitnicki <jakub@xxxxxxxxxxxxxx> wrote: > > On Fri, Aug 28, 2020 at 11:48 AM CEST, Lorenz Bauer wrote: > > Add bpf_iter support for sockmap / sockhash, based on the bpf_sk_storage and > > hashtable implementation. sockmap and sockhash share the same iteration > > context: a pointer to an arbitrary key and a pointer to a socket. Both > > pointers may be NULL, and so BPF has to perform a NULL check before accessing > > them. Technically it's not possible for sockhash iteration to yield a NULL > > socket, but we ignore this to be able to use a single iteration point. > > > > Iteration will visit all keys that remain unmodified during the lifetime of > > the iterator. It may or may not visit newly added ones. > > > > Signed-off-by: Lorenz Bauer <lmb@xxxxxxxxxxxxxx> > > --- > > net/core/sock_map.c | 283 ++++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 283 insertions(+) > > > > diff --git a/net/core/sock_map.c b/net/core/sock_map.c > > index d6c6e1e312fc..31c4332f06e4 100644 > > --- a/net/core/sock_map.c > > +++ b/net/core/sock_map.c > > @@ -703,6 +703,116 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = { > > .arg4_type = ARG_ANYTHING, > > }; > > > > +struct sock_map_seq_info { > > + struct bpf_map *map; > > + struct sock *sk; > > + u32 index; > > +}; > > + > > +struct bpf_iter__sockmap { > > + __bpf_md_ptr(struct bpf_iter_meta *, meta); > > + __bpf_md_ptr(struct bpf_map *, map); > > + __bpf_md_ptr(void *, key); > > + __bpf_md_ptr(struct bpf_sock *, sk); > > +}; > > + > > +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, > > + struct bpf_map *map, void *key, > > + struct sock *sk) > > + > > +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) > > +{ > > + if (unlikely(info->index >= info->map->max_entries)) > > + return NULL; > > + > > + info->sk = __sock_map_lookup_elem(info->map, info->index); > > + if (!info->sk || !sk_fullsock(info->sk)) > > As we've talked off-line, we don't 
expect either timewait or request > sockets in sockmap so sk_fullsock() check is likely not needed. Ack. > > > + info->sk = NULL; > > + > > + /* continue iterating */ > > + return info; > > + } > > + > > +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) > > +{ > > + struct sock_map_seq_info *info = seq->private; > > + > > + if (*pos == 0) > > + ++*pos; > > + > > + /* pairs with sock_map_seq_stop */ > > + rcu_read_lock(); > > + return sock_map_seq_lookup_elem(info); > > +} > > + > > +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) > > +{ > > + struct sock_map_seq_info *info = seq->private; > > + > > + ++*pos; > > + ++info->index; > > + > > + return sock_map_seq_lookup_elem(info); > > +} > > + > > +static int __sock_map_seq_show(struct seq_file *seq, void *v) > > +{ > > + struct sock_map_seq_info *info = seq->private; > > + struct bpf_iter__sockmap ctx = {}; > > + struct bpf_iter_meta meta; > > + struct bpf_prog *prog; > > + > > + meta.seq = seq; > > + prog = bpf_iter_get_info(&meta, !v); > > + if (!prog) > > + return 0; > > + > > + ctx.meta = &meta; > > + ctx.map = info->map; > > + if (v) { > > + ctx.key = &info->index; > > + ctx.sk = (struct bpf_sock *)info->sk; > > + } > > + > > + return bpf_iter_run_prog(prog, &ctx); > > +} > > + > > +static int sock_map_seq_show(struct seq_file *seq, void *v) > > +{ > > + return __sock_map_seq_show(seq, v); > > +} > > + > > +static void sock_map_seq_stop(struct seq_file *seq, void *v) > > +{ > > + if (!v) > > + (void)__sock_map_seq_show(seq, NULL); > > + > > + /* pairs with sock_map_seq_start */ > > + rcu_read_unlock(); > > +} > > + > > +static const struct seq_operations sock_map_seq_ops = { > > + .start = sock_map_seq_start, > > + .next = sock_map_seq_next, > > + .stop = sock_map_seq_stop, > > + .show = sock_map_seq_show, > > +}; > > + > > +static int sock_map_init_seq_private(void *priv_data, > > + struct bpf_iter_aux_info *aux) > > +{ > > + struct sock_map_seq_info *info = 
priv_data; > > + > > + info->map = aux->map; > > + return 0; > > +} > > + > > +static const struct bpf_iter_seq_info sock_map_iter_seq_info = { > > + .seq_ops = &sock_map_seq_ops, > > + .init_seq_private = sock_map_init_seq_private, > > + .seq_priv_size = sizeof(struct sock_map_seq_info), > > +}; > > + > > static int sock_map_btf_id; > > const struct bpf_map_ops sock_map_ops = { > > .map_alloc = sock_map_alloc, > > [...] > > > @@ -1198,6 +1309,120 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = { > > .arg4_type = ARG_ANYTHING, > > }; > > > > +struct sock_hash_seq_info { > > + struct bpf_map *map; > > + struct bpf_shtab *htab; > > + u32 bucket_id; > > +}; > > + > > +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, > > + struct bpf_shtab_elem *prev_elem) > > +{ > > + const struct bpf_shtab *htab = info->htab; > > + struct bpf_shtab_bucket *bucket; > > + struct bpf_shtab_elem *elem; > > + struct hlist_node *node; > > + > > + /* try to find next elem in the same bucket */ > > + if (prev_elem) { > > + node = rcu_dereference_raw(hlist_next_rcu(&prev_elem->node)); > > I'm not convinced we need to go for the rcu_dereference_raw() > variant. Access happens inside read-side critical section, which we > entered with rcu_read_lock() in sock_hash_seq_start(). > > That's typical and rcu_dereference() seems appropriate. Basing this on > what I read in Documentation/RCU/rcu_dereference.rst. Yeah, that makes sense to me. However, sock_hash_get_next_key also uses rcu_dereference_raw. John, can you shed some light on why that is? Can we replace that with plain rcu_dereference as well? 
> > > + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); > > + if (elem) > > + return elem; > > + > > + /* no more elements, continue in the next bucket */ > > + info->bucket_id++; > > + } > > + > > + for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { > > + bucket = &htab->buckets[info->bucket_id]; > > + node = rcu_dereference_raw(hlist_first_rcu(&bucket->head)); > > + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); > > + if (elem) > > + return elem; > > + } > > + > > + return NULL; > > +} > > + > > +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) > > +{ > > + struct sock_hash_seq_info *info = seq->private; > > + > > + if (*pos == 0) > > + ++*pos; > > + > > + /* pairs with sock_hash_seq_stop */ > > + rcu_read_lock(); > > + return sock_hash_seq_find_next(info, NULL); > > +} > > + > > +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) > > +{ > > + struct sock_hash_seq_info *info = seq->private; > > + > > + ++*pos; > > + return sock_hash_seq_find_next(info, v); > > +} > > + > > +static int __sock_hash_seq_show(struct seq_file *seq, struct bpf_shtab_elem *elem) > > +{ > > + struct sock_hash_seq_info *info = seq->private; > > + struct bpf_iter__sockmap ctx = {}; > > + struct bpf_iter_meta meta; > > + struct bpf_prog *prog; > > + > > + meta.seq = seq; > > + prog = bpf_iter_get_info(&meta, !elem); > > + if (!prog) > > + return 0; > > + > > + ctx.meta = &meta; > > + ctx.map = info->map; > > + if (elem) { > > + ctx.key = elem->key; > > + ctx.sk = (struct bpf_sock *)elem->sk; > > + } > > + > > + return bpf_iter_run_prog(prog, &ctx); > > +} > > + > > +static int sock_hash_seq_show(struct seq_file *seq, void *v) > > +{ > > + return __sock_hash_seq_show(seq, v); > > +} > > + > > +static void sock_hash_seq_stop(struct seq_file *seq, void *v) > > +{ > > + if (!v) > > + (void)__sock_hash_seq_show(seq, NULL); > > + > > + /* pairs with sock_hash_seq_start */ > > + rcu_read_unlock(); > > +} > > + > > 
+static const struct seq_operations sock_hash_seq_ops = { > > + .start = sock_hash_seq_start, > > + .next = sock_hash_seq_next, > > + .stop = sock_hash_seq_stop, > > + .show = sock_hash_seq_show, > > +}; > > + > > +static int sock_hash_init_seq_private(void *priv_data, > > + struct bpf_iter_aux_info *aux) > > +{ > > + struct sock_hash_seq_info *info = priv_data; > > + > > + info->map = aux->map; > > + return 0; > > +} > > + > > +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { > > + .seq_ops = &sock_hash_seq_ops, > > + .init_seq_private = sock_hash_init_seq_private, > > + .seq_priv_size = sizeof(struct sock_hash_seq_info), > > +}; > > + > > static int sock_hash_map_btf_id; > > const struct bpf_map_ops sock_hash_ops = { > > .map_alloc = sock_hash_alloc, > > [...] -- Lorenz Bauer | Systems Engineer 6th Floor, County Hall/The Riverside Building, SE1 7PB, UK www.cloudflare.com