Re: [PATCH bpf-next 1/3] net: Allow iterating sockmap and sockhash

On Mon, 31 Aug 2020 at 11:04, Jakub Sitnicki <jakub@xxxxxxxxxxxxxx> wrote:
>
> On Fri, Aug 28, 2020 at 11:48 AM CEST, Lorenz Bauer wrote:
> > Add bpf_iter support for sockmap / sockhash, based on the bpf_sk_storage and
> > hashtable implementation. sockmap and sockhash share the same iteration
> > context: a pointer to an arbitrary key and a pointer to a socket. Both
> > pointers may be NULL, and so BPF has to perform a NULL check before accessing
> > them. Technically it's not possible for sockhash iteration to yield a NULL
> > socket, but we ignore this to be able to use a single iteration point.
> >
> > Iteration will visit all keys that remain unmodified during the lifetime of
> > the iterator. It may or may not visit newly added ones.
> >
> > Signed-off-by: Lorenz Bauer <lmb@xxxxxxxxxxxxxx>
> > ---
> >  net/core/sock_map.c | 283 ++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 283 insertions(+)
> >
> > diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> > index d6c6e1e312fc..31c4332f06e4 100644
> > --- a/net/core/sock_map.c
> > +++ b/net/core/sock_map.c
> > @@ -703,6 +703,116 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
> >       .arg4_type      = ARG_ANYTHING,
> >  };
> >
> > +struct sock_map_seq_info {
> > +     struct bpf_map *map;
> > +     struct sock *sk;
> > +     u32 index;
> > +};
> > +
> > +struct bpf_iter__sockmap {
> > +     __bpf_md_ptr(struct bpf_iter_meta *, meta);
> > +     __bpf_md_ptr(struct bpf_map *, map);
> > +     __bpf_md_ptr(void *, key);
> > +     __bpf_md_ptr(struct bpf_sock *, sk);
> > +};
> > +
> > +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
> > +                  struct bpf_map *map, void *key,
> > +                  struct sock *sk)
> > +
> > +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
> > +{
> > +     if (unlikely(info->index >= info->map->max_entries))
> > +             return NULL;
> > +
> > +     info->sk = __sock_map_lookup_elem(info->map, info->index);
> > +     if (!info->sk || !sk_fullsock(info->sk))
>
> As we've talked off-line, we expect neither timewait nor request
> sockets in sockmap, so the sk_fullsock() check is likely not needed.

Ack.
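
For anyone following the thread: even with the sk_fullsock() check
gone, a BPF program still has to NULL check both ctx->key and ctx->sk
before using them, as per the commit message. A minimal sketch of a
consumer, roughly what a selftest could look like (untested, names
made up):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  char _license[] SEC("license") = "GPL";

  int elems = 0;

  SEC("iter/sockmap")
  int count_elems(struct bpf_iter__sockmap *ctx)
  {
          /* Both pointers may be NULL, e.g. once iteration has
           * passed the last element, so check before access.
           */
          if (ctx->key && ctx->sk)
                  elems++;

          return 0;
  }

Assuming bpftool's existing "iter pin ... map" support extends to
this, attaching would then be something like:

  bpftool iter pin iter.o /sys/fs/bpf/sockmap_iter \
          map pinned /sys/fs/bpf/my_sockmap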

>
> > +             info->sk = NULL;
> > +
> > +     /* continue iterating */
> > +     return info;
> > +}
> > +
> > +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
> > +{
> > +     struct sock_map_seq_info *info = seq->private;
> > +
> > +     if (*pos == 0)
> > +             ++*pos;
> > +
> > +     /* pairs with sock_map_seq_stop */
> > +     rcu_read_lock();
> > +     return sock_map_seq_lookup_elem(info);
> > +}
> > +
> > +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > +{
> > +     struct sock_map_seq_info *info = seq->private;
> > +
> > +     ++*pos;
> > +     ++info->index;
> > +
> > +     return sock_map_seq_lookup_elem(info);
> > +}
> > +
> > +static int __sock_map_seq_show(struct seq_file *seq, void *v)
> > +{
> > +     struct sock_map_seq_info *info = seq->private;
> > +     struct bpf_iter__sockmap ctx = {};
> > +     struct bpf_iter_meta meta;
> > +     struct bpf_prog *prog;
> > +
> > +     meta.seq = seq;
> > +     prog = bpf_iter_get_info(&meta, !v);
> > +     if (!prog)
> > +             return 0;
> > +
> > +     ctx.meta = &meta;
> > +     ctx.map = info->map;
> > +     if (v) {
> > +             ctx.key = &info->index;
> > +             ctx.sk = (struct bpf_sock *)info->sk;
> > +     }
> > +
> > +     return bpf_iter_run_prog(prog, &ctx);
> > +}
> > +
> > +static int sock_map_seq_show(struct seq_file *seq, void *v)
> > +{
> > +     return __sock_map_seq_show(seq, v);
> > +}
> > +
> > +static void sock_map_seq_stop(struct seq_file *seq, void *v)
> > +{
> > +     if (!v)
> > +             (void)__sock_map_seq_show(seq, NULL);
> > +
> > +     /* pairs with sock_map_seq_start */
> > +     rcu_read_unlock();
> > +}
> > +
> > +static const struct seq_operations sock_map_seq_ops = {
> > +     .start  = sock_map_seq_start,
> > +     .next   = sock_map_seq_next,
> > +     .stop   = sock_map_seq_stop,
> > +     .show   = sock_map_seq_show,
> > +};
> > +
> > +static int sock_map_init_seq_private(void *priv_data,
> > +                                  struct bpf_iter_aux_info *aux)
> > +{
> > +     struct sock_map_seq_info *info = priv_data;
> > +
> > +     info->map = aux->map;
> > +     return 0;
> > +}
> > +
> > +static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
> > +     .seq_ops                = &sock_map_seq_ops,
> > +     .init_seq_private       = sock_map_init_seq_private,
> > +     .seq_priv_size          = sizeof(struct sock_map_seq_info),
> > +};
> > +
> >  static int sock_map_btf_id;
> >  const struct bpf_map_ops sock_map_ops = {
> >       .map_alloc              = sock_map_alloc,
>
> [...]
>
> > @@ -1198,6 +1309,120 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
> >       .arg4_type      = ARG_ANYTHING,
> >  };
> >
> > +struct sock_hash_seq_info {
> > +     struct bpf_map *map;
> > +     struct bpf_shtab *htab;
> > +     u32 bucket_id;
> > +};
> > +
> > +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
> > +                                  struct bpf_shtab_elem *prev_elem)
> > +{
> > +     const struct bpf_shtab *htab = info->htab;
> > +     struct bpf_shtab_bucket *bucket;
> > +     struct bpf_shtab_elem *elem;
> > +     struct hlist_node *node;
> > +
> > +     /* try to find next elem in the same bucket */
> > +     if (prev_elem) {
> > +             node = rcu_dereference_raw(hlist_next_rcu(&prev_elem->node));
>
> I'm not convinced we need to go for the rcu_dereference_raw()
> variant. Access happens inside read-side critical section, which we
> entered with rcu_read_lock() in sock_hash_seq_start().
>
> That's the typical pattern, and rcu_dereference() seems appropriate
> here. I'm basing this on what I read in
> Documentation/RCU/rcu_dereference.rst.

Yeah, that makes sense to me. However, sock_hash_get_next_key() also
uses rcu_dereference_raw(). John, can you shed some light on why that
is? Can we replace it with plain rcu_dereference() as well?
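
To make sure we're talking about the same change, I think it would
just be (untested):

  -		node = rcu_dereference_raw(hlist_next_rcu(&prev_elem->node));
  +		node = rcu_dereference(hlist_next_rcu(&prev_elem->node));

plus the same substitution for the hlist_first_rcu() call below. That
should be safe, since we hold rcu_read_lock() from ->start() to
->stop() for the entire walk.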

>
> > +             elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
> > +             if (elem)
> > +                     return elem;
> > +
> > +             /* no more elements, continue in the next bucket */
> > +             info->bucket_id++;
> > +     }
> > +
> > +     for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
> > +             bucket = &htab->buckets[info->bucket_id];
> > +             node = rcu_dereference_raw(hlist_first_rcu(&bucket->head));
> > +             elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
> > +             if (elem)
> > +                     return elem;
> > +     }
> > +
> > +     return NULL;
> > +}
> > +
> > +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
> > +{
> > +     struct sock_hash_seq_info *info = seq->private;
> > +
> > +     if (*pos == 0)
> > +             ++*pos;
> > +
> > +     /* pairs with sock_hash_seq_stop */
> > +     rcu_read_lock();
> > +     return sock_hash_seq_find_next(info, NULL);
> > +}
> > +
> > +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > +{
> > +     struct sock_hash_seq_info *info = seq->private;
> > +
> > +     ++*pos;
> > +     return sock_hash_seq_find_next(info, v);
> > +}
> > +
> > +static int __sock_hash_seq_show(struct seq_file *seq, struct bpf_shtab_elem *elem)
> > +{
> > +     struct sock_hash_seq_info *info = seq->private;
> > +     struct bpf_iter__sockmap ctx = {};
> > +     struct bpf_iter_meta meta;
> > +     struct bpf_prog *prog;
> > +
> > +     meta.seq = seq;
> > +     prog = bpf_iter_get_info(&meta, !elem);
> > +     if (!prog)
> > +             return 0;
> > +
> > +     ctx.meta = &meta;
> > +     ctx.map = info->map;
> > +     if (elem) {
> > +             ctx.key = elem->key;
> > +             ctx.sk = (struct bpf_sock *)elem->sk;
> > +     }
> > +
> > +     return bpf_iter_run_prog(prog, &ctx);
> > +}
> > +
> > +static int sock_hash_seq_show(struct seq_file *seq, void *v)
> > +{
> > +     return __sock_hash_seq_show(seq, v);
> > +}
> > +
> > +static void sock_hash_seq_stop(struct seq_file *seq, void *v)
> > +{
> > +     if (!v)
> > +             (void)__sock_hash_seq_show(seq, NULL);
> > +
> > +     /* pairs with sock_hash_seq_start */
> > +     rcu_read_unlock();
> > +}
> > +
> > +static const struct seq_operations sock_hash_seq_ops = {
> > +     .start  = sock_hash_seq_start,
> > +     .next   = sock_hash_seq_next,
> > +     .stop   = sock_hash_seq_stop,
> > +     .show   = sock_hash_seq_show,
> > +};
> > +
> > +static int sock_hash_init_seq_private(void *priv_data,
> > +                                  struct bpf_iter_aux_info *aux)
> > +{
> > +     struct sock_hash_seq_info *info = priv_data;
> > +
> > +     info->map = aux->map;
> > +     return 0;
> > +}
> > +
> > +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
> > +     .seq_ops                = &sock_hash_seq_ops,
> > +     .init_seq_private       = sock_hash_init_seq_private,
> > +     .seq_priv_size          = sizeof(struct sock_hash_seq_info),
> > +};
> > +
> >  static int sock_hash_map_btf_id;
> >  const struct bpf_map_ops sock_hash_ops = {
> >       .map_alloc              = sock_hash_alloc,
>
> [...]



-- 
Lorenz Bauer  |  Systems Engineer
6th Floor, County Hall/The Riverside Building, SE1 7PB, UK

www.cloudflare.com


