On 1/14/20 3:49 PM, Brian Vazquez wrote: > Hi Yonghong, thanks for reviewing it! > > On Tue, Jan 14, 2020 at 2:56 PM Yonghong Song <yhs@xxxxxx> wrote: >> >> >> >> On 1/14/20 8:46 AM, Brian Vazquez wrote: >>> From: Yonghong Song <yhs@xxxxxx> >>> >>> htab can't use generic batch support due some problematic behaviours >>> inherent to the data structre, i.e. while iterating the bpf map a >>> concurrent program might delete the next entry that batch was about to >>> use, in that case there's no easy solution to retrieve the next entry, >>> the issue has been discussed multiple times (see [1] and [2]). >>> >>> The only way hmap can be traversed without the problem previously >>> exposed is by making sure that the map is traversing entire buckets. >>> This commit implements those strict requirements for hmap, the >>> implementation follows the same interaction that generic support with >>> some exceptions: >>> >>> - If keys/values buffer are not big enough to traverse a bucket, >>> ENOSPC will be returned. >>> - out_batch contains the value of the next bucket in the iteration, not >>> the next key, but this is transparent for the user since the user >>> should never use out_batch for other than bpf batch syscalls. >>> >>> This commits implements BPF_MAP_LOOKUP_BATCH and adds support for new >>> command BPF_MAP_LOOKUP_AND_DELETE_BATCH. Note that for update/delete >>> batch ops it is possible to use the generic implementations. >>> >>> [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@xxxxxxxxxx/ >>> [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@xxxxxx/ >>> >>> Signed-off-by: Yonghong Song <yhs@xxxxxx> >>> Signed-off-by: Brian Vazquez <brianvv@xxxxxxxxxx> >>> --- >>> include/linux/bpf.h | 3 + >>> include/uapi/linux/bpf.h | 1 + >>> kernel/bpf/hashtab.c | 258 +++++++++++++++++++++++++++++++++++++++ >>> kernel/bpf/syscall.c | 9 +- >>> 4 files changed, 270 insertions(+), 1 deletion(-) >>> >>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >>> index 05466ad6cf1c5..3517e32149a4f 100644 >>> --- a/include/linux/bpf.h >>> +++ b/include/linux/bpf.h >>> @@ -46,6 +46,9 @@ struct bpf_map_ops { >>> void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); >>> int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, >>> union bpf_attr __user *uattr); >>> + int (*map_lookup_and_delete_batch)(struct bpf_map *map, >>> + const union bpf_attr *attr, >>> + union bpf_attr __user *uattr); >>> int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr, >>> union bpf_attr __user *uattr); >>> int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >>> index e8df9ca680e0c..9536729a03d57 100644 >>> --- a/include/uapi/linux/bpf.h >>> +++ b/include/uapi/linux/bpf.h >>> @@ -108,6 +108,7 @@ enum bpf_cmd { >>> BPF_MAP_FREEZE, >>> BPF_BTF_GET_NEXT_ID, >>> BPF_MAP_LOOKUP_BATCH, >>> + BPF_MAP_LOOKUP_AND_DELETE_BATCH, >>> BPF_MAP_UPDATE_BATCH, >>> BPF_MAP_DELETE_BATCH, >>> }; >>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c >>> index 22066a62c8c97..d9888acfd632b 100644 >>> --- a/kernel/bpf/hashtab.c >>> +++ b/kernel/bpf/hashtab.c >>> @@ -17,6 +17,16 @@ >>> (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ >>> BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) >>> >>> +#define BATCH_OPS(_name) \ >>> + .map_lookup_batch = \ >>> + _name##_map_lookup_batch, \ >>> + .map_lookup_and_delete_batch = \ >>> + _name##_map_lookup_and_delete_batch, \ >>> + .map_update_batch = \ >>> + generic_map_update_batch, \ >>> + .map_delete_batch = \ >>> + generic_map_delete_batch >>> + >>> struct bucket { >>> struct hlist_nulls_head head; >>> raw_spinlock_t lock; >>> @@ -1232,6 +1242,250 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, >>> rcu_read_unlock(); >>> } >>> >>> +static int >>> +__htab_map_lookup_and_delete_batch(struct bpf_map *map, >>> + const union bpf_attr *attr, >>> + union bpf_attr __user *uattr, >>> + bool do_delete, bool is_lru_map, >>> + bool is_percpu) >>> +{ >>> + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); >>> + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; >>> + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; >>> + void __user *uvalues = u64_to_user_ptr(attr->batch.values); >>> + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); >>> + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); >>> + u32 batch, max_count, size, bucket_size; >>> + u64 elem_map_flags, map_flags; >>> + struct hlist_nulls_head *head; >>> + struct hlist_nulls_node *n; >>> + unsigned long flags; >>> + struct htab_elem *l; >>> + struct bucket *b; >>> + int ret = 0; >>> + >>> + elem_map_flags = attr->batch.elem_flags; >>> + if ((elem_map_flags & ~BPF_F_LOCK) || >>> + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) >>> + return -EINVAL; >>> + >>> + map_flags = attr->batch.flags; >>> + if (map_flags) >>> + return -EINVAL; >>> + >>> + max_count = attr->batch.count; >>> + if (!max_count) >>> + return 0; >>> + >>> + batch = 0; >>> + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) >>> + return -EFAULT; >>> + >>> + if (batch >= htab->n_buckets) >>> + return -ENOENT; >>> + >>> + key_size = htab->map.key_size; >>> + roundup_key_size = round_up(htab->map.key_size, 8); >>> + value_size = htab->map.value_size; >>> + size = round_up(value_size, 8); >>> + if (is_percpu) >>> + value_size = size * num_possible_cpus(); >>> + total = 0; >>> + bucket_size = 1; >> >> Have you checked typical hash table linklist length? > While testing with hash tables ranging from 10 to 1000 entries I saw > linked lists of upto 5 entries. >> Maybe initial value bucket_size = 2 is able to cover most common cases? > I think 4-5 is still a reasonable number, what do you think? 5 should be okay. You can add some comments to explain why "5". >> >>> + >>> +alloc: >>> + /* We cannot do copy_from_user or copy_to_user inside >>> + * the rcu_read_lock. Allocate enough space here. >>> + */ >>> + keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN); >>> + values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN); >>> + if (!keys || !values) { >>> + ret = -ENOMEM; >>> + goto out; >> >> In this case, we won't copy batch and total to user buffer. Maybe we >> should do that? > Yes, I think last line should be: goto after_loop; >> >> >>> + } >>> + >>> +again: >>> + preempt_disable(); >>> + this_cpu_inc(bpf_prog_active); >>> + rcu_read_lock(); >>> +again_nocopy: >>> + dst_key = keys; >>> + dst_val = values; >>> + b = &htab->buckets[batch]; >>> + head = &b->head; >>> + raw_spin_lock_irqsave(&b->lock, flags); >>> + >>> + bucket_cnt = 0; >>> + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) >>> + bucket_cnt++; >>> + >>> + if (bucket_cnt > (max_count - total)) { >>> + if (total == 0) >>> + ret = -ENOSPC; >>> + raw_spin_unlock_irqrestore(&b->lock, flags); >>> + rcu_read_unlock(); >>> + this_cpu_dec(bpf_prog_active); >>> + preempt_enable(); >>> + goto after_loop; >>> + } >>> + >>> + if (bucket_cnt > bucket_size) { >>> + bucket_size = bucket_cnt; >>> + raw_spin_unlock_irqrestore(&b->lock, flags); >>> + rcu_read_unlock(); >>> + this_cpu_dec(bpf_prog_active); >>> + preempt_enable(); >>> + kvfree(keys); >>> + kvfree(values); >>> + goto alloc; >>> + } >>> + >>> + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { >>> + memcpy(dst_key, l->key, key_size); >>> + >>> + if (is_percpu) { >>> + int off = 0, cpu; >>> + void __percpu *pptr; >>> + >>> + pptr = htab_elem_get_ptr(l, map->key_size); >>> + for_each_possible_cpu(cpu) { >>> + bpf_long_memcpy(dst_val + off, >>> + per_cpu_ptr(pptr, cpu), size); >>> + off += size; >>> + } >>> + } else { >>> + value = l->key + roundup_key_size; >>> + if (elem_map_flags & BPF_F_LOCK) >>> + copy_map_value_locked(map, dst_val, value, >>> + true); >>> + else >>> + copy_map_value(map, dst_val, value); >>> + check_and_init_map_lock(map, dst_val); >>> + } >>> + if (do_delete) { >>> + hlist_nulls_del_rcu(&l->hash_node); >>> + if (is_lru_map) >>> + bpf_lru_push_free(&htab->lru, &l->lru_node); >>> + else >>> + free_htab_elem(htab, l); >>> + } >>> + dst_key += key_size; >>> + dst_val += value_size; >>> + } >>> + >>> + raw_spin_unlock_irqrestore(&b->lock, flags); >>> + /* If we are not copying data, we can go to next bucket and avoid >>> + * unlocking the rcu. >>> + */ >>> + if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { >>> + batch++; >>> + goto again_nocopy; >>> + } >>> + >>> + rcu_read_unlock(); >>> + this_cpu_dec(bpf_prog_active); >>> + preempt_enable(); >>> + if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, >>> + key_size * bucket_cnt) || >>> + copy_to_user(uvalues + total * value_size, values, >>> + value_size * bucket_cnt))) { >>> + ret = -EFAULT; >>> + goto after_loop; >>> + } >>> + >>> + total += bucket_cnt; >>> + batch++; >>> + if (batch >= htab->n_buckets) { >>> + ret = -ENOENT; >>> + goto after_loop; >>> + } >>> + goto again; >>> + >>> +after_loop: >>> + if (ret && (ret != -ENOENT && ret != -EFAULT)) >>> + goto out; >> >> We won't have many error codes reaching here, -ENOENT, -EFAULT, -ENOSPC, >> and possibly -ENOMEM. >> Maybe just >> if (ret == -EFAULT) >> goto out; >> > Yes I think that make senses, I only need to add > > if (put_user(0, &uattr->batch.count)) > return -EFAULT; > > before traversing the map to make sure that if there is an error, > batch.count doesn't miss report entries since that variable is used as > input/output. Does this make sense? This does make sense. You can put the above checking right before the "alloc" label. Everything after that will go through copying "count". > >>> + >>> + /* copy # of entries and next batch */ >>> + ubatch = u64_to_user_ptr(attr->batch.out_batch); >>> + if (copy_to_user(ubatch, &batch, sizeof(batch)) || >>> + put_user(total, &uattr->batch.count)) >>> + ret = -EFAULT; >>> + >>> +out: >>> + kvfree(keys); >>> + kvfree(values); >>> + return ret; >>> +} >>> + >> [...]