Hi Yonghong, thanks for working on this! I have some concerns about this
implementation, but overall I think this might work for our use case too!

On Sun, Sep 8, 2019 at 1:11 AM Yonghong Song <yhs@xxxxxx> wrote:
>
> Brian Vazquez has proposed a BPF_MAP_DUMP command to look up more than one
> map entry per syscall.
> https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@xxxxxxxxxxxxxx/T/#t
>
> During discussion, we found that more use cases can be supported in a similar
> map operation batching framework. For example, batched map lookup and delete,
> which can be really helpful for bcc.
> https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
> https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
>
> Also, in bcc, we have an API to delete all entries in a map.
> https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264
>
> For map update, batched operations are also useful, as applications sometimes
> need to populate initial maps with more than one entry. For example, the below
> example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
> https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550
>
> This patch addresses all the above use cases. To make the uapi stable, it also
> covers other potential use cases. Four bpf syscall subcommands are introduced:
>     BPF_MAP_LOOKUP_BATCH
>     BPF_MAP_LOOKUP_AND_DELETE_BATCH
>     BPF_MAP_UPDATE_BATCH
>     BPF_MAP_DELETE_BATCH
>
> The UAPI attribute structure looks like:
>
>   struct { /* struct used by BPF_MAP_*_BATCH commands */
>           __u64           batch;          /* input/output:
>                                            * input: start batch,
>                                            * 0 to start from beginning.
>                                            * output: next start batch,
>                                            * 0 to end batching.
>                                            */
>           __aligned_u64   keys;
>           __aligned_u64   values;
>           __u32           count;          /* input/output:
>                                            * input: # of elements keys/values.
>                                            * output: # of filled elements.
>                                            */
>           __u32           map_fd;
>           __u64           elem_flags;
>           __u64           flags;
>   } batch;
>
> An opaque value 'batch' is used for user/kernel space communication of where
> in the map to start the operation for lookup/lookup_and_delete/delete.
>   input 'batch' = 0: start the operation from the beginning of the map.
>   output 'batch': if not 0, the next input for the batch operation.
>
> For lookup/lookup_and_delete:
>   operation: lookup/lookup_and_delete starting from a particular 'batch'.
>   return:
>     'batch'   'count'   return code   meaning
>       0          0           0        Done. Nothing left
>       0          0        -ENOSPC     no space to handle batch 0
>      > 0         0        -ENOSPC     no space to handle 'batch'
>      > 0        > 0          0        stopped right before 'batch'
>   Note that:
>   (1). Even if the return code is 0 and the returned 'count' > 0, the returned
>        'count' may not be equal to the input 'count'. This happens when there
>        is not enough space to handle a batch.
>   (2). If the return code is an error other than -EFAULT, 'batch' indicates
>        the batch that has issues and 'count' indicates the number of elements
>        successfully processed.
>
> For delete:
>   operation: deletion starting from a particular 'batch'.
>   return: 0 means everything is deleted from 'batch';
>           an error code means some deletions did not happen.
>
> For update:
>   operation: update 'count' number of elements in 'keys'/'values'.
>   return: 0 means successful updates for all elements;
>           on an error other than -EFAULT, 'count' is the number of
>           successful updates.
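
To make sure I'm reading the semantics right, here is roughly how I'd expect
userspace to drive BPF_MAP_LOOKUP_BATCH, based on the attr layout and return
convention described above. This is an untested sketch: the wrapper name, the
locally mirrored attr struct and the command value are mine, nothing below is
in any released header.

        /* Untested userspace sketch against the *proposed* uapi in this
         * patch.  The command number and the attribute layout are mirrored
         * locally since they are not in released headers yet.
         */
        #include <stdint.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/syscall.h>

        /* placeholder; the real value comes from the final uapi header */
        #define BPF_MAP_LOOKUP_BATCH 24

        struct bpf_batch_attr {         /* mirrors the 'batch' member above */
                uint64_t batch;
                uint64_t keys;
                uint64_t values;
                uint32_t count;
                uint32_t map_fd;
                uint64_t elem_flags;
                uint64_t flags;
        };

        static int map_lookup_batch(int map_fd, uint64_t *batch,
                                    void *keys, void *values, uint32_t *count)
        {
                struct bpf_batch_attr attr = {
                        .batch  = *batch,       /* 0 on the first call */
                        .keys   = (uint64_t)(unsigned long)keys,
                        .values = (uint64_t)(unsigned long)values,
                        .count  = *count,       /* capacity of keys[]/values[] */
                        .map_fd = map_fd,
                };
                int err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH,
                                  &attr, sizeof(attr));

                *batch = attr.batch;    /* next start position, 0 when done */
                *count = attr.count;    /* entries actually filled */
                return err;
        }

i.e. a full dump would just be a loop that starts with batch == 0, consumes
'count' entries after each call, grows the buffers on -ENOSPC, and stops once
the returned batch is 0 again. If that matches your intent, great.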
>
> Signed-off-by: Yonghong Song <yhs@xxxxxx>
> ---
>  include/linux/bpf.h      |   9 ++
>  include/uapi/linux/bpf.h |  22 +++
>  kernel/bpf/hashtab.c     | 324 +++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c     |  68 ++++++++
>  4 files changed, 423 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 5b9d22338606..3c1302e8e2d4 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -37,6 +37,15 @@ struct bpf_map_ops {
>         int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
>         void (*map_release_uref)(struct bpf_map *map);
>         void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
> +       int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
> +                               union bpf_attr __user *uattr);
> +       int (*map_lookup_and_delete_batch)(struct bpf_map *map,
> +                                          const union bpf_attr *attr,
> +                                          union bpf_attr __user *uattr);
> +       int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr,
> +                               union bpf_attr __user *uattr);
> +       int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,
> +                               union bpf_attr __user *uattr);
>
>         /* funcs callable from userspace and from eBPF programs */
>         void *(*map_lookup_elem)(struct bpf_map *map, void *key);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 5d2fb183ee2d..9d4f76073dd9 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -107,6 +107,10 @@ enum bpf_cmd {
>         BPF_MAP_LOOKUP_AND_DELETE_ELEM,
>         BPF_MAP_FREEZE,
>         BPF_BTF_GET_NEXT_ID,
> +       BPF_MAP_LOOKUP_BATCH,
> +       BPF_MAP_LOOKUP_AND_DELETE_BATCH,
> +       BPF_MAP_UPDATE_BATCH,
> +       BPF_MAP_DELETE_BATCH,
>  };
>
>  enum bpf_map_type {
> @@ -396,6 +400,24 @@ union bpf_attr {
>                 __u64           flags;
>         };
>
> +       struct { /* struct used by BPF_MAP_*_BATCH commands */
> +               __u64           batch;          /* input/output:
> +                                                * input: start batch,
> +                                                * 0 to start from beginning.
> +                                                * output: next start batch,
> +                                                * 0 to end batching.
> +                                                */
> +               __aligned_u64   keys;
> +               __aligned_u64   values;
> +               __u32           count;          /* input/output:
> +                                                * input: # of elements keys/values.
> +                                                * output: # of filled elements.
> +                                                */
> +               __u32           map_fd;
> +               __u64           elem_flags;
> +               __u64           flags;
> +       } batch;
> +
>         struct { /* anonymous struct used by BPF_PROG_LOAD command */
>                 __u32           prog_type;      /* one of enum bpf_prog_type */
>                 __u32           insn_cnt;
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 22066a62c8c9..ee7b90200f4d 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -1232,6 +1232,322 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
>         rcu_read_unlock();
>  }
>
> +static int
> +__htab_map_lookup_and_delete_batch(struct bpf_map *map,
> +                                   const union bpf_attr *attr,
> +                                   union bpf_attr __user *uattr,
> +                                   bool do_delete, bool is_lru_map)
> +{
> +       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
> +       void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
> +       u64 elem_map_flags, map_flags;
> +       struct hlist_nulls_head *head;
> +       void __user *ukeys, *uvalues;
> +       struct hlist_nulls_node *n;
> +       u32 batch, max_count;
> +       unsigned long flags;
> +       struct htab_elem *l;
> +       struct bucket *b;
> +       int ret = 0;
> +
> +       max_count = attr->batch.count;
> +       if (!max_count)
> +               return 0;
> +
> +       elem_map_flags = attr->batch.elem_flags;
> +       if ((elem_map_flags & ~BPF_F_LOCK) ||
> +           ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
> +               return -EINVAL;
> +
> +       map_flags = attr->batch.flags;
> +       if (map_flags)
> +               return -EINVAL;
> +
> +       batch = (u32)attr->batch.batch;
> +       if (batch >= htab->n_buckets)
> +               return -EINVAL;
> +
> +       /* We cannot do copy_from_user or copy_to_user inside
> +        * the rcu_read_lock. Allocate enough space here.
> +        */
> +       key_size = htab->map.key_size;
> +       roundup_key_size = round_up(htab->map.key_size, 8);
> +       value_size = htab->map.value_size;
> +       keys = kvmalloc(key_size * max_count, GFP_USER | __GFP_NOWARN);
> +       values = kvmalloc(value_size * max_count, GFP_USER | __GFP_NOWARN);
> +       if (!keys || !values) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +
> +       dst_key = keys;
> +       dst_val = values;
> +       total = 0;
> +
> +       preempt_disable();
> +       this_cpu_inc(bpf_prog_active);
> +       rcu_read_lock();
> +
> +again:
> +       b = &htab->buckets[batch];
> +       head = &b->head;
> +       raw_spin_lock_irqsave(&b->lock, flags);
> +

Would it be possible to avoid taking that lock when we're not deleting (i.e.
when we're only doing a batched lookup)? To be honest, I don't know how much
impact grabbing that lock would have when concurrent additions are happening
from a bpf program.
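
For what it's worth, what I had in mind for the lookup-only case is roughly
the fragment below: walk the bucket under RCU and skip b->lock entirely.
Completely untested and it won't build as-is (the after_loop/unlock handling
would need reworking since the lock isn't taken here). It also changes the
semantics a bit, because a bucket could then be split across two batches and
there is no per-bucket snapshot anymore:

        /* hypothetical lookup-only path inside the function above:
         * no bucket lock, RCU protection only, so concurrent updates from
         * bpf programs may be partially observed.
         */
        if (!do_delete) {
                hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
                        if (total == max_count)
                                break;  /* buffer full; needs a lock-free exit path */
                        memcpy(dst_key, l->key, key_size);
                        value = l->key + roundup_key_size;
                        if (elem_map_flags & BPF_F_LOCK)
                                copy_map_value_locked(map, dst_val, value, true);
                        else
                                copy_map_value(map, dst_val, value);
                        check_and_init_map_lock(map, dst_val);
                        dst_key += key_size;
                        dst_val += value_size;
                        total++;
                }
                /* then advance 'batch' and continue to the next bucket */
        }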
> +       bucket_cnt = 0;
> +       hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
> +               bucket_cnt++;
> +
> +       if (bucket_cnt > (max_count - total)) {
> +               if (total == 0)
> +                       ret = -ENOSPC;
> +               goto after_loop;
> +       }
> +
> +       hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
> +               memcpy(dst_key, l->key, key_size);
> +
> +               value = l->key + roundup_key_size;
> +               if (elem_map_flags & BPF_F_LOCK)
> +                       copy_map_value_locked(map, dst_val, value, true);
> +               else
> +                       copy_map_value(map, dst_val, value);
> +               check_and_init_map_lock(map, dst_val);
> +
> +               dst_key += key_size;
> +               dst_val += value_size;
> +               total++;
> +       }
> +
> +       if (do_delete) {
> +               hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
> +                       hlist_nulls_del_rcu(&l->hash_node);
> +                       if (is_lru_map)
> +                               bpf_lru_push_free(&htab->lru, &l->lru_node);
> +                       else
> +                               free_htab_elem(htab, l);
> +               }
> +       }
> +
> +       batch++;
> +       if (batch >= htab->n_buckets) {
> +               batch = 0;
> +               goto after_loop;
> +       }
> +
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +       goto again;
> +
> +after_loop:
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +
> +       rcu_read_unlock();
> +       this_cpu_dec(bpf_prog_active);
> +       preempt_enable();
> +
> +       /* copy data back to user */
> +       ukeys = u64_to_user_ptr(attr->batch.keys);
> +       uvalues = u64_to_user_ptr(attr->batch.values);
> +       if (put_user(batch, &uattr->batch.batch) ||
> +           copy_to_user(ukeys, keys, total * key_size) ||
> +           copy_to_user(uvalues, values, total * value_size) ||
> +           put_user(total, &uattr->batch.count))
> +               ret = -EFAULT;
> +
> +out:
> +       kvfree(keys);
> +       kvfree(values);
> +       return ret;
> +}
> +
> +static int
> +__htab_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
> +                       union bpf_attr __user *uattr, bool is_lru_map)
> +{
> +       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       u32 count, max_count, key_size, roundup_key_size, value_size;
> +       u64 elem_map_flags, map_flags;
> +       void __user *ukey, *uvalue;
> +       void *key, *value;
> +       int ret = 0;
> +
> +       max_count = attr->batch.count;
> +       if (!max_count)
> +               return 0;
> +
> +       elem_map_flags = attr->batch.elem_flags;
> +       if ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))
> +               return -EINVAL;
> +
> +       map_flags = attr->batch.flags;
> +       if (map_flags)
> +               return -EINVAL;
> +
> +       key_size = htab->map.key_size;
> +       roundup_key_size = round_up(htab->map.key_size, 8);
> +       value_size = htab->map.value_size;
> +       key = kmalloc(key_size, GFP_USER | __GFP_NOWARN);
> +       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
> +       if (!key || !value) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +
> +       ukey = u64_to_user_ptr(attr->batch.keys);
> +       uvalue = u64_to_user_ptr(attr->batch.values);
> +       for (count = 0; count < max_count; count++) {
> +               if (copy_from_user(key, ukey + count * key_size, key_size) ||
> +                   copy_from_user(value, uvalue + count * value_size, value_size)) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               preempt_disable();
> +               __this_cpu_inc(bpf_prog_active);
> +               rcu_read_lock();
> +               if (is_lru_map)
> +                       ret = htab_lru_map_update_elem(map, key, value, elem_map_flags);
> +               else
> +                       ret = htab_map_update_elem(map, key, value, elem_map_flags);
> +               rcu_read_unlock();
> +               __this_cpu_dec(bpf_prog_active);
> +               preempt_enable();
> +
> +               if (ret) {
> +                       if (put_user(count, &uattr->batch.count))
> +                               ret = -EFAULT;
> +                       break;
> +               }
> +       }
> +
> +out:
> +       kfree(key);
> +       kfree(value);
> +       return ret;
> +}
> +
> +static int
> +__htab_map_delete_batch(struct bpf_map *map,
> +                       const union bpf_attr *attr,
> +                       union bpf_attr __user *uattr,
> +                       bool is_lru_map)
> +{
> +       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       u64 elem_map_flags, map_flags;
> +       struct hlist_nulls_head *head;
> +       struct hlist_nulls_node *n;
> +       u32 batch, max_count;
> +       unsigned long flags;
> +       struct htab_elem *l;
> +       struct bucket *b;
> +
> +       elem_map_flags = attr->batch.elem_flags;
> +       map_flags = attr->batch.flags;
> +       if (elem_map_flags || map_flags)
> +               return -EINVAL;
> +
> +       max_count = attr->batch.count;
> +       batch = (u32)attr->batch.batch;
> +       if (max_count || batch >= htab->n_buckets)
> +               return -EINVAL;
> +
> +       preempt_disable();
> +       __this_cpu_inc(bpf_prog_active);
> +       rcu_read_lock();
> +
> +again:
> +       b = &htab->buckets[batch];
> +       head = &b->head;
> +       raw_spin_lock_irqsave(&b->lock, flags);
> +
> +       hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
> +               hlist_nulls_del_rcu(&l->hash_node);
> +               if (is_lru_map)
> +                       bpf_lru_push_free(&htab->lru, &l->lru_node);
> +               else
> +                       free_htab_elem(htab, l);
> +       }
> +
> +       batch++;
> +       if (batch >= htab->n_buckets)
> +               goto out;
> +
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +       goto again;
> +
> +out:
> +       raw_spin_unlock_irqrestore(&b->lock, flags);
> +       rcu_read_unlock();
> +       __this_cpu_dec(bpf_prog_active);
> +       preempt_enable();
> +
> +       return 0;
> +}
> +
> +static int
> +htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
> +                     union bpf_attr __user *uattr)
> +{
> +       return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
> +                                                 false);
> +}
> +
> +static int
> +htab_map_lookup_and_delete_batch(struct bpf_map *map,
> +                                const union bpf_attr *attr,
> +                                union bpf_attr __user *uattr)
> +{
> +       return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
> +                                                 false);
> +}
> +
> +static int
> +htab_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
> +                     union bpf_attr __user *uattr)
> +{
> +       return __htab_map_update_batch(map, attr, uattr, false);
> +}
> +
> +static int
> +htab_map_delete_batch(struct bpf_map *map,
> +                     const union bpf_attr *attr,
> +                     union bpf_attr __user *uattr)
> +{
> +       return __htab_map_delete_batch(map, attr, uattr, false);
> +}
> +
> +static int
> +htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
> +                         union bpf_attr __user *uattr)
> +{
> +       return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
> +                                                 true);
> +}
> +
> +static int
> +htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
> +                                    const union bpf_attr *attr,
> +                                    union bpf_attr __user *uattr)
> +{
> +       return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
> +                                                 true);
> +}
> +
> +static int
> +htab_lru_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
> +                         union bpf_attr __user *uattr)
> +{
> +       return __htab_map_update_batch(map, attr, uattr, true);
> +}
> +
> +static int
> +htab_lru_map_delete_batch(struct bpf_map *map,
> +                         const union bpf_attr *attr,
> +                         union bpf_attr __user *uattr)
> +{
> +       return __htab_map_delete_batch(map, attr, uattr, true);
> +}
> +
>  const struct bpf_map_ops htab_map_ops = {
>         .map_alloc_check = htab_map_alloc_check,
>         .map_alloc = htab_map_alloc,
> @@ -1242,6 +1558,10 @@ const struct bpf_map_ops htab_map_ops = {
>         .map_delete_elem = htab_map_delete_elem,
>         .map_gen_lookup = htab_map_gen_lookup,
>         .map_seq_show_elem = htab_map_seq_show_elem,
> +       .map_lookup_batch = htab_map_lookup_batch,
> +       .map_lookup_and_delete_batch = htab_map_lookup_and_delete_batch,
> +       .map_update_batch = htab_map_update_batch,
> +       .map_delete_batch = htab_map_delete_batch,
>  };
>
>  const struct bpf_map_ops htab_lru_map_ops = {
> @@ -1255,6 +1575,10 @@ const struct bpf_map_ops htab_lru_map_ops = {
>         .map_delete_elem = htab_lru_map_delete_elem,
>         .map_gen_lookup = htab_lru_map_gen_lookup,
>         .map_seq_show_elem = htab_map_seq_show_elem,
> +       .map_lookup_batch = htab_lru_map_lookup_batch,
> +       .map_lookup_and_delete_batch = htab_lru_map_lookup_and_delete_batch,
> +       .map_update_batch = htab_lru_map_update_batch,
> +       .map_delete_batch = htab_lru_map_delete_batch,
>  };
>
>  /* Called from eBPF program */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index ca60eafa6922..e83bdf7efbd8 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2816,6 +2816,62 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
>         return err;
>  }
>
> +#define BPF_MAP_BATCH_LAST_FIELD batch.flags
> +
> +#define BPF_DO_BATCH(fn)                        \
> +       do {                                     \
> +               if (!fn) {                       \
> +                       err = -ENOTSUPP;         \
> +                       goto err_put;            \
> +               }                                \
> +               err = fn(map, attr, uattr);      \
> +       } while(0)
> +
> +static int bpf_map_do_batch(const union bpf_attr *attr,
> +                           union bpf_attr __user *uattr,
> +                           int cmd)
> +{
> +       struct bpf_map *map;
> +       int err, ufd;
> +       struct fd f;
> +
> +       if (CHECK_ATTR(BPF_MAP_BATCH))
> +               return -EINVAL;
> +
> +       ufd = attr->batch.map_fd;
> +       f = fdget(ufd);
> +       map = __bpf_map_get(f);
> +       if (IS_ERR(map))
> +               return PTR_ERR(map);
> +
> +       if ((cmd == BPF_MAP_LOOKUP_BATCH ||
> +            cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
> +           !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
> +               err = -EPERM;
> +               goto err_put;
> +       }
> +
> +       if (cmd != BPF_MAP_LOOKUP_BATCH &&
> +           !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
> +               err = -EPERM;
> +               goto err_put;
> +       }
> +
> +       if (cmd == BPF_MAP_LOOKUP_BATCH) {
> +               BPF_DO_BATCH(map->ops->map_lookup_batch);
> +       } else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) {
> +               BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
> +       } else if (cmd == BPF_MAP_UPDATE_BATCH) {
> +               BPF_DO_BATCH(map->ops->map_update_batch);
> +       } else {
> +               BPF_DO_BATCH(map->ops->map_delete_batch);
> +       }
> +
> +err_put:
> +       fdput(f);
> +       return err;
> +}
> +
>  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>  {
>         union bpf_attr attr = {};
> @@ -2913,6 +2969,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>         case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
>                 err = map_lookup_and_delete_elem(&attr);
>                 break;
> +       case BPF_MAP_LOOKUP_BATCH:
> +               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
> +               break;
> +       case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
> +               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_AND_DELETE_BATCH);
> +               break;
> +       case BPF_MAP_UPDATE_BATCH:
> +               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
> +               break;
> +       case BPF_MAP_DELETE_BATCH:
> +               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
> +               break;
>         default:
>                 err = -EINVAL;
>                 break;
> --
> 2.17.1
>

In general it'd be great if we could express the old functions (get_next_key,
lookup, delete) in terms of these new ones, so we avoid having two copies of
the code. Also, to be honest I don't see how batching updates would be
useful; maybe just do batched lookup and lookup_and_delete for now?
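
To illustrate the update point: the xdp_redirect_cpu-style initial population
mentioned in the commit message is only a handful of per-element syscalls
with the existing API, so the win from batching there looks small to me.
Rough sketch using libbpf's existing bpf_map_update_elem() wrapper (the
function name and sizes below are made up for illustration):

        /* roughly what the existing initial-population loop looks like
         * with the current per-element API
         */
        #include <bpf/bpf.h>    /* libbpf wrapper for BPF_MAP_UPDATE_ELEM */

        static int populate_cpu_map(int cpu_map_fd, unsigned int n_cpus,
                                    __u32 qsize)
        {
                for (__u32 cpu = 0; cpu < n_cpus; cpu++) {
                        /* one syscall per CPU entry; n_cpus is small, so the
                         * syscall overhead is negligible compared to dumping
                         * a large hash map
                         */
                        if (bpf_map_update_elem(cpu_map_fd, &cpu, &qsize, 0))
                                return -1;
                }
                return 0;
        }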