Hi Yonghong, thanks for working on this! I have some concerns about this
implementation, but overall I think this might work for our use case too!

On Sun, Sep 8, 2019 at 1:11 AM Yonghong Song <yhs@xxxxxx> wrote:
>
> Brian Vazquez has proposed a BPF_MAP_DUMP command to look up more than one
> map entry per syscall.
> https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@xxxxxxxxxxxxxx/T/#t
>
> During discussion, we found that more use cases can be supported in a similar
> map operation batching framework. For example, batched map lookup and delete,
> which can be really helpful for bcc.
> https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
> https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
>
> Also, in bcc, we have an API to delete all entries in a map.
> https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264
>
> For map update, batched operations are also useful, as applications sometimes
> need to populate initial maps with more than one entry. For example, the below
> example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
> https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550
>
> This patch addresses all the above use cases. To make the uapi stable, it also
> covers other potential use cases. Four bpf syscall subcommands are introduced:
>     BPF_MAP_LOOKUP_BATCH
>     BPF_MAP_LOOKUP_AND_DELETE_BATCH
>     BPF_MAP_UPDATE_BATCH
>     BPF_MAP_DELETE_BATCH
>
> The UAPI attribute structure looks like:
>
>   struct { /* struct used by BPF_MAP_*_BATCH commands */
>           __u64           batch;          /* input/output:
>                                            * input: start batch,
>                                            * 0 to start from beginning.
>                                            * output: next start batch,
>                                            * 0 to end batching.
>                                            */
>           __aligned_u64   keys;
>           __aligned_u64   values;
>           __u32           count;          /* input/output:
>                                            * input: # of elements keys/values.
>                                            * output: # of filled elements.
>                                            */
>           __u32           map_fd;
>           __u64           elem_flags;
>           __u64           flags;
>   } batch;
>
> An opaque value 'batch' is used for user/kernel space communication of where
> in the map to start the operation for lookup/lookup_and_delete/delete.
>   input 'batch' = 0: start the operation from the beginning of the map.
>   output 'batch': if not 0, the next input for the batch operation.
>
> For lookup/lookup_and_delete:
>   operation: lookup/lookup_and_delete starting from a particular 'batch'.
>   return:
>     'batch'   'count'   return code   meaning
>       0          0           0        Done. Nothing left
>       0          0        -ENOSPC     no space to handle batch 0
>      > 0         0        -ENOSPC     no space to handle 'batch'
>      > 0        > 0          0        stopped right before 'batch'
>   Note that:
>   (1). Even if the return code is 0 and the returned 'count' > 0, the returned
>        'count' may not be equal to the input 'count'. This happens when there
>        is not enough space to handle a batch.
>   (2). If the return code is an error other than -EFAULT, 'batch' indicates
>        the batch that has issues and 'count' indicates the number of elements
>        successfully processed.
>
> For delete:
>   operation: deletion starting from a particular 'batch'.
>   return: 0 means everything is deleted from 'batch';
>           an error code means some deletions did not happen.
>
> For update:
>   operation: update 'count' number of elements in 'keys'/'values'.
>   return: 0 means successful updates for all elements;
>           on an error other than -EFAULT, 'count' is the number of
>           successful updates.
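
To make sure I'm reading the semantics right, here is roughly how I'd expect
userspace to drive BPF_MAP_LOOKUP_BATCH, based on the attr layout and return
convention described above. This is an untested sketch: the wrapper name, the
locally mirrored attr struct and the command value are mine, nothing below is
in any released header.

        /* Untested userspace sketch against the *proposed* uapi in this
         * patch.  The command number and the attribute layout are mirrored
         * locally since they are not in released headers yet.
         */
        #include <stdint.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/syscall.h>

        /* placeholder; the real value comes from the final uapi header */
        #define BPF_MAP_LOOKUP_BATCH 24

        struct bpf_batch_attr {         /* mirrors the 'batch' member above */
                uint64_t batch;
                uint64_t keys;
                uint64_t values;
                uint32_t count;
                uint32_t map_fd;
                uint64_t elem_flags;
                uint64_t flags;
        };

        static int map_lookup_batch(int map_fd, uint64_t *batch,
                                    void *keys, void *values, uint32_t *count)
        {
                struct bpf_batch_attr attr = {
                        .batch  = *batch,       /* 0 on the first call */
                        .keys   = (uint64_t)(unsigned long)keys,
                        .values = (uint64_t)(unsigned long)values,
                        .count  = *count,       /* capacity of keys[]/values[] */
                        .map_fd = map_fd,
                };
                int err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH,
                                  &attr, sizeof(attr));

                *batch = attr.batch;    /* next start position, 0 when done */
                *count = attr.count;    /* entries actually filled */
                return err;
        }

i.e. a full dump would just be a loop that starts with batch == 0, consumes
'count' entries after each call, grows the buffers on -ENOSPC, and stops once
the returned batch is 0 again. If that matches your intent, great.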
>
> Signed-off-by: Yonghong Song <yhs@xxxxxx>
> ---
>  include/linux/bpf.h      |   9 ++
>  include/uapi/linux/bpf.h |  22 +++
>  kernel/bpf/hashtab.c     | 324 +++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c     |  68 ++++++++
>  4 files changed, 423 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 5b9d22338606..3c1302e8e2d4 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -37,6 +37,15 @@ struct bpf_map_ops {
>         int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
>         void (*map_release_uref)(struct bpf_map *map);
>         void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
> +       int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
> +                               union bpf_attr __user *uattr);
> +       int (*map_lookup_and_delete_batch)(struct bpf_map *map,
> +                                          const union bpf_attr *attr,
> +                                          union bpf_attr __user *uattr);
> +       int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr,
> +                               union bpf_attr __user *uattr);
> +       int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,
> +                               union bpf_attr __user *uattr);
>
>         /* funcs callable from userspace and from eBPF programs */
>         void *(*map_lookup_elem)(struct bpf_map *map, void *key);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 5d2fb183ee2d..9d4f76073dd9 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -107,6 +107,10 @@ enum bpf_cmd {
>         BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>         BPF_MAP_FREEZE,
>         BPF_BTF_GET_NEXT_ID,
> +       BPF_MAP_LOOKUP_BATCH,
> +       BPF_MAP_LOOKUP_AND_DELETE_BATCH,
> +       BPF_MAP_UPDATE_BATCH,
> +       BPF_MAP_DELETE_BATCH,
>  };
>
>  enum bpf_map_type {
> @@ -396,6 +400,24 @@ union bpf_attr {
>                 __u64           flags;
>         };
>
> +       struct { /* struct used by BPF_MAP_*_BATCH commands */
> +               __u64           batch;          /* input/output:
> +                                                * input: start batch,
> +                                                * 0 to start from beginning.
> +                                                * output: next start batch,
> +                                                * 0 to end batching.
> +                                                */
> +               __aligned_u64   keys;
> +               __aligned_u64   values;
> +               __u32           count;          /* input/output:
> +                                                * input: # of elements keys/values.
> +                                                * output: # of filled elements.
> +                                                */
> +               __u32           map_fd;
> +               __u64           elem_flags;
> +               __u64           flags;
> +       } batch;
> +
>         struct { /* anonymous struct used by BPF_PROG_LOAD command */
>                 __u32           prog_type;      /* one of enum bpf_prog_type */
>                 __u32           insn_cnt;
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 22066a62c8c9..ee7b90200f4d 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -1232,6 +1232,322 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
>         rcu_read_unlock();
>  }
>
> +static int
> +__htab_map_lookup_and_delete_batch(struct bpf_map *map,
> +                                   const union bpf_attr *attr,
> +                                   union bpf_attr __user *uattr,
> +                                   bool do_delete, bool is_lru_map)
> +{
> +       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
> +       void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
> +       u64 elem_map_flags, map_flags;
> +       struct hlist_nulls_head *head;
> +       void __user *ukeys, *uvalues;
> +       struct hlist_nulls_node *n;
> +       u32 batch, max_count;
> +       unsigned long flags;
> +       struct htab_elem *l;
> +       struct bucket *b;
> +       int ret = 0;
> +
> +       max_count = attr->batch.count;
> +       if (!max_count)
> +               return 0;
> +
> +       elem_map_flags = attr->batch.elem_flags;
> +       if ((elem_map_flags & ~BPF_F_LOCK) ||
> +           ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
> +               return -EINVAL;
> +
> +       map_flags = attr->batch.flags;
> +       if (map_flags)
> +               return -EINVAL;
> +
> +       batch = (u32)attr->batch.batch;
> +       if (batch >= htab->n_buckets)
> +               return -EINVAL;
> +
> +       /* We cannot do copy_from_user or copy_to_user inside
> +        * the rcu_read_lock. Allocate enough space here.
> +        */
> +       key_size = htab->map.key_size;
> +       roundup_key_size = round_up(htab->map.key_size, 8);
> +       value_size = htab->map.value_size;
> +       keys = kvmalloc(key_size * max_count, GFP_USER | __GFP_NOWARN);
> +       values = kvmalloc(value_size * max_count, GFP_USER | __GFP_NOWARN);
> +       if (!keys || !values) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +
> +       dst_key = keys;
> +       dst_val = values;
> +       total = 0;
> +
> +       preempt_disable();
> +       this_cpu_inc(bpf_prog_active);
> +       rcu_read_lock();
> +
> +again:
> +       b = &htab->buckets[batch];
> +       head = &b->head;
> +       raw_spin_lock_irqsave(&b->lock, flags);
> +

Would it be possible to avoid taking that lock when we're not deleting (i.e.
when we're only doing a batched lookup)? To be honest, I don't know how much
impact grabbing that lock would have when concurrent additions are happening
from a bpf program.
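
For what it's worth, what I had in mind for the lookup-only case is roughly
the fragment below: walk the bucket under RCU and skip b->lock entirely.
Completely untested and it won't build as-is (the after_loop/unlock handling
would need reworking since the lock isn't taken here). It also changes the
semantics a bit, because a bucket could then be split across two batches and
there is no per-bucket snapshot anymore:

        /* hypothetical lookup-only path inside the function above:
         * no bucket lock, RCU protection only, so concurrent updates from
         * bpf programs may be partially observed.
         */
        if (!do_delete) {
                hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
                        if (total == max_count)
                                break;  /* buffer full; needs a lock-free exit path */
                        memcpy(dst_key, l->key, key_size);
                        value = l->key + roundup_key_size;
                        if (elem_map_flags & BPF_F_LOCK)
                                copy_map_value_locked(map, dst_val, value, true);
                        else
                                copy_map_value(map, dst_val, value);
                        check_and_init_map_lock(map, dst_val);
                        dst_key += key_size;
                        dst_val += value_size;
                        total++;
                }
                /* then advance 'batch' and continue to the next bucket */
        }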
> +       bucket_cnt = 0;
> +       hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
> +               bucket_cnt++;
> +
> +       if (bucket_cnt > (max_count - total)) {
> +               if (total == 0)
> +                       ret = -ENOSPC;
> +               goto after_loop;
> +       }
> +
> +       hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
> +               memcpy(dst_key, l->key, key_size);
> +
> +               value = l->key + roundup_key_size;
> +               if (elem_map_flags & BPF_F_LOCK)
> +                       copy_map_value_locked(map, dst_val, value, true);
> +               else
> +                       copy_map_value(map, dst_val, value);
> +               check_and_init_map_lock(map, dst_val);
> +
> +               dst_key += key_size;
> +               dst_val += value_size;
> +               total++;
> +       }
> +
> +       if (do_delete) {
> +               hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
> +                       hlist_nulls_del_rcu(&l->hash_node);
> +                       if (is_lru_map)
> +                               bpf_lru_push_free(&htab->lru, &l->lru_node);
> +                       else
> +                               free_htab_elem(htab, l);
> +               }
> +       }
> +
> +       batch++;
> +       if (batch >= htab->n_buckets) {
> +               batch = 0;
> +               goto after_loop;
> +       }
> +
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +       goto again;
> +
> +after_loop:
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +
> +       rcu_read_unlock();
> +       this_cpu_dec(bpf_prog_active);
> +       preempt_enable();
> +
> +       /* copy data back to user */
> +       ukeys = u64_to_user_ptr(attr->batch.keys);
> +       uvalues = u64_to_user_ptr(attr->batch.values);
> +       if (put_user(batch, &uattr->batch.batch) ||
> +           copy_to_user(ukeys, keys, total * key_size) ||
> +           copy_to_user(uvalues, values, total * value_size) ||
> +           put_user(total, &uattr->batch.count))
> +               ret = -EFAULT;
> +
> +out:
> +       kvfree(keys);
> +       kvfree(values);
> +       return ret;
> +}
> +
> +static int
> +__htab_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
> +                       union bpf_attr __user *uattr, bool is_lru_map)
> +{
> +       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       u32 count, max_count, key_size, roundup_key_size, value_size;
> +       u64 elem_map_flags, map_flags;
> +       void __user *ukey, *uvalue;
> +       void *key, *value;
> +       int ret = 0;
> +
> +       max_count = attr->batch.count;
> +       if (!max_count)
> +               return 0;
> +
> +       elem_map_flags = attr->batch.elem_flags;
> +       if ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))
> +               return -EINVAL;
> +
> +       map_flags = attr->batch.flags;
> +       if (map_flags)
> +               return -EINVAL;
> +
> +       key_size = htab->map.key_size;
> +       roundup_key_size = round_up(htab->map.key_size, 8);
> +       value_size = htab->map.value_size;
> +       key = kmalloc(key_size, GFP_USER | __GFP_NOWARN);
> +       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
> +       if (!key || !value) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +
> +       ukey = u64_to_user_ptr(attr->batch.keys);
> +       uvalue = u64_to_user_ptr(attr->batch.values);
> +       for (count = 0; count < max_count; count++) {
> +               if (copy_from_user(key, ukey + count * key_size, key_size) ||
> +                   copy_from_user(value, uvalue + count * value_size, value_size)) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               preempt_disable();
> +               __this_cpu_inc(bpf_prog_active);
> +               rcu_read_lock();
> +               if (is_lru_map)
> +                       ret = htab_lru_map_update_elem(map, key, value, elem_map_flags);
> +               else
> +                       ret = htab_map_update_elem(map, key, value, elem_map_flags);
> +               rcu_read_unlock();
> +               __this_cpu_dec(bpf_prog_active);
> +               preempt_enable();
> +
> +               if (ret) {
> +                       if (put_user(count, &uattr->batch.count))
> +                               ret = -EFAULT;
> +                       break;
> +               }
> +       }
> +
> +out:
> +       kfree(key);
> +       kfree(value);
> +       return ret;
> +}
> +
> +static int
> +__htab_map_delete_batch(struct bpf_map *map,
> +                       const union bpf_attr *attr,
> +                       union bpf_attr __user *uattr,
> +                       bool is_lru_map)
> +{
> +       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       u64 elem_map_flags, map_flags;
> +       struct hlist_nulls_head *head;
> +       struct hlist_nulls_node *n;
> +       u32 batch, max_count;
> +       unsigned long flags;
> +       struct htab_elem *l;
> +       struct bucket *b;
> +
> +       elem_map_flags = attr->batch.elem_flags;
> +       map_flags = attr->batch.flags;
> +       if (elem_map_flags || map_flags)
> +               return -EINVAL;
> +
> +       max_count = attr->batch.count;
> +       batch = (u32)attr->batch.batch;
> +       if (max_count || batch >= htab->n_buckets)
> +               return -EINVAL;
> +
> +       preempt_disable();
> +       __this_cpu_inc(bpf_prog_active);
> +       rcu_read_lock();
> +
> +again:
> +       b = &htab->buckets[batch];
> +       head = &b->head;
> +       raw_spin_lock_irqsave(&b->lock, flags);
> +
> +       hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
> +               hlist_nulls_del_rcu(&l->hash_node);
> +               if (is_lru_map)
> +                       bpf_lru_push_free(&htab->lru, &l->lru_node);
> +               else
> +                       free_htab_elem(htab, l);
> +       }
> +
> +       batch++;
> +       if (batch >= htab->n_buckets)
> +               goto out;
> +
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +       goto again;
> +
> +out:
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +       rcu_read_unlock();
> +       __this_cpu_dec(bpf_prog_active);
> +       preempt_enable();
> +
> +       return 0;
> +}
> +
> +static int
> +htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
> +                     union bpf_attr __user *uattr)
> +{
> +       return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
> +                                                 false);
> +}
> +
> +static int
> +htab_map_lookup_and_delete_batch(struct bpf_map *map,
> +                                const union bpf_attr *attr,
> +                                union bpf_attr __user *uattr)
> +{
> +       return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
> +                                                 false);
> +}
> +
> +static int
> +htab_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
> +                     union bpf_attr __user *uattr)
> +{
> +       return __htab_map_update_batch(map, attr, uattr, false);
> +}
> +
> +static int
> +htab_map_delete_batch(struct bpf_map *map,
> +                     const union bpf_attr *attr,
> +                     union bpf_attr __user *uattr)
> +{
> +       return __htab_map_delete_batch(map, attr, uattr, false);
> +}
> +
> +static int
> +htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
> +                         union bpf_attr __user *uattr)
> +{
> +       return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
> +                                                 true);
> +}
> +
> +static int
> +htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
> +                                    const union bpf_attr *attr,
> +                                    union bpf_attr __user *uattr)
> +{
> +       return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
> +                                                 true);
> +}
> +
> +static int
> +htab_lru_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
> +                         union bpf_attr __user *uattr)
> +{
> +       return __htab_map_update_batch(map, attr, uattr, true);
> +}
> +
> +static int
> +htab_lru_map_delete_batch(struct bpf_map *map,
> +                         const union bpf_attr *attr,
> +                         union bpf_attr __user *uattr)
> +{
> +       return __htab_map_delete_batch(map, attr, uattr, true);
> +}
> +
>  const struct bpf_map_ops htab_map_ops = {
>         .map_alloc_check = htab_map_alloc_check,
>         .map_alloc = htab_map_alloc,
> @@ -1242,6 +1558,10 @@ const struct bpf_map_ops htab_map_ops = {
>         .map_delete_elem = htab_map_delete_elem,
>         .map_gen_lookup = htab_map_gen_lookup,
>         .map_seq_show_elem = htab_map_seq_show_elem,
> +       .map_lookup_batch = htab_map_lookup_batch,
> +       .map_lookup_and_delete_batch = htab_map_lookup_and_delete_batch,
> +       .map_update_batch = htab_map_update_batch,
> +       .map_delete_batch = htab_map_delete_batch,
>  };
>
>  const struct bpf_map_ops htab_lru_map_ops = {
> @@ -1255,6 +1575,10 @@ const struct bpf_map_ops htab_lru_map_ops = {
>         .map_delete_elem = htab_lru_map_delete_elem,
>         .map_gen_lookup = htab_lru_map_gen_lookup,
>         .map_seq_show_elem = htab_map_seq_show_elem,
> +       .map_lookup_batch = htab_lru_map_lookup_batch,
> +       .map_lookup_and_delete_batch = htab_lru_map_lookup_and_delete_batch,
> +       .map_update_batch = htab_lru_map_update_batch,
> +       .map_delete_batch = htab_lru_map_delete_batch,
>  };
>
>  /* Called from eBPF program */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index ca60eafa6922..e83bdf7efbd8 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2816,6 +2816,62 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
>         return err;
>  }
>
> +#define BPF_MAP_BATCH_LAST_FIELD batch.flags
> +
> +#define BPF_DO_BATCH(fn)                        \
> +       do {                                     \
> +               if (!fn) {                       \
> +                       err = -ENOTSUPP;         \
> +                       goto err_put;            \
> +               }                                \
> +               err = fn(map, attr, uattr);      \
> +       } while(0)
> +
> +static int bpf_map_do_batch(const union bpf_attr *attr,
> +                           union bpf_attr __user *uattr,
> +                           int cmd)
> +{
> +       struct bpf_map *map;
> +       int err, ufd;
> +       struct fd f;
> +
> +       if (CHECK_ATTR(BPF_MAP_BATCH))
> +               return -EINVAL;
> +
> +       ufd = attr->batch.map_fd;
> +       f = fdget(ufd);
> +       map = __bpf_map_get(f);
> +       if (IS_ERR(map))
> +               return PTR_ERR(map);
> +
> +       if ((cmd == BPF_MAP_LOOKUP_BATCH ||
> +            cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
> +           !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
> +               err = -EPERM;
> +               goto err_put;
> +       }
> +
> +       if (cmd != BPF_MAP_LOOKUP_BATCH &&
> +           !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
> +               err = -EPERM;
> +               goto err_put;
> +       }
> +
> +       if (cmd == BPF_MAP_LOOKUP_BATCH) {
> +               BPF_DO_BATCH(map->ops->map_lookup_batch);
> +       } else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) {
> +               BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
> +       } else if (cmd == BPF_MAP_UPDATE_BATCH) {
> +               BPF_DO_BATCH(map->ops->map_update_batch);
> +       } else {
> +               BPF_DO_BATCH(map->ops->map_delete_batch);
> +       }
> +
> +err_put:
> +       fdput(f);
> +       return err;
> +}
> +
>  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>  {
>         union bpf_attr attr = {};
> @@ -2913,6 +2969,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>         case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
>                 err = map_lookup_and_delete_elem(&attr);
>                 break;
> +       case BPF_MAP_LOOKUP_BATCH:
> +               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
> +               break;
> +       case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
> +               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_AND_DELETE_BATCH);
> +               break;
> +       case BPF_MAP_UPDATE_BATCH:
> +               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
> +               break;
> +       case BPF_MAP_DELETE_BATCH:
> +               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
> +               break;
>         default:
>                 err = -EINVAL;
>                 break;
> --
> 2.17.1
>

In general it'd be great if we could express the old functions (get_next_key,
lookup, delete) in terms of these new ones, so we avoid having two copies of
the code. Also, to be honest I don't see how batching updates would be
useful; maybe just do batched lookup and lookup_and_delete for now?
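
To illustrate the update point: the xdp_redirect_cpu-style initial population
mentioned in the commit message is only a handful of per-element syscalls
with the existing API, so the win from batching there looks small to me.
Rough sketch using libbpf's existing bpf_map_update_elem() wrapper (the
function name and sizes below are made up for illustration):

        /* roughly what the existing initial-population loop looks like
         * with the current per-element API
         */
        #include <bpf/bpf.h>    /* libbpf wrapper for BPF_MAP_UPDATE_ELEM */

        static int populate_cpu_map(int cpu_map_fd, unsigned int n_cpus,
                                    __u32 qsize)
        {
                for (__u32 cpu = 0; cpu < n_cpus; cpu++) {
                        /* one syscall per CPU entry; n_cpus is small, so the
                         * syscall overhead is negligible compared to dumping
                         * a large hash map
                         */
                        if (bpf_map_update_elem(cpu_map_fd, &cpu, &qsize, 0))
                                return -1;
                }
                return 0;
        }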