Re: [PATCH bpf-next 12/16] bpf: Support basic operations for dynptr key in hash map

On Tue, Oct 8, 2024 at 2:02 AM Hou Tao <houtao@xxxxxxxxxxxxxxx> wrote:
>
> From: Hou Tao <houtao1@xxxxxxxxxx>
>
> This patch supports lookup, update, delete and lookup_delete operations
> for hash maps with dynptr keys. There are two major differences between
> the implementation of a normal hash map and a dynptr-keyed hash map:
>
> 1) A dynptr-keyed hash map doesn't support pre-allocation.
> The reason is that the dynptrs in the map key are allocated dynamically
> through the bpf memory allocator. The length limit for these dynptrs is
> currently 4088 bytes. Because these dynptrs are allocated dynamically,
> memory consumption will be lower than for a normal hash map when the
> lengths of the dynptrs vary widely.
>
> 2) A freed element in a dynptr-keyed map will not be reused immediately.
> For a normal hash map, a freed element may be reused immediately by a
> newly-added element, so a concurrent lookup may return an incorrect
> result due to element deletion and reuse. A dynptr-keyed map cannot
> tolerate that: there are pointers (dynptrs) in the map key, and the
> update of a dynptr is not atomic because both its address and its
> length are updated. If the element were reused immediately, accessing a
> dynptr in the freed element could cause an invalid memory access due to
> a mismatch between the dynptr's address and size, so the freed element
> is only reused after one RCU grace period.
>
> Besides the differences above, a dynptr-keyed hash map also needs to
> handle possibly-nullified dynptrs in the map key.
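
For readers new to the interface, here is a minimal sketch of how such a
map might be declared on the BPF side, assuming the UAPI proposed in this
series (BPF_F_DYNPTR_IN_KEY in map_flags, maximum dynptr length in
map_extra); the struct and field names below are made up:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Key mixes a fixed-size part with a variable-length dynptr part. */
struct dynptr_key {
	__u32 id;
	struct bpf_dynptr name;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(map_flags, BPF_F_DYNPTR_IN_KEY | BPF_F_NO_PREALLOC);
	__uint(map_extra, 4088);	/* max dynptr length, per the commit message */
	__uint(max_entries, 8192);
	__type(key, struct dynptr_key);
	__type(value, __u64);
} dynptr_hash SEC(".maps");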
>
> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
> ---
>  kernel/bpf/hashtab.c | 283 +++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 257 insertions(+), 26 deletions(-)
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index b14b87463ee0..edf19d36a413 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -88,6 +88,7 @@ struct bpf_htab {
>         struct bpf_map map;
>         struct bpf_mem_alloc ma;
>         struct bpf_mem_alloc pcpu_ma;
> +       struct bpf_mem_alloc dynptr_ma;
>         struct bucket *buckets;
>         void *elems;
>         union {
> @@ -425,6 +426,7 @@ static int htab_map_alloc_check(union bpf_attr *attr)
>         bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
>         bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
>         bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
> +       bool dynptr_in_key = (attr->map_flags & BPF_F_DYNPTR_IN_KEY);
>         int numa_node = bpf_map_attr_numa_node(attr);
>
>         BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
> @@ -438,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
>             !bpf_map_flags_access_ok(attr->map_flags))
>                 return -EINVAL;
>
> +       if (dynptr_in_key) {
> +               if (percpu || lru || prealloc || !attr->map_extra)
> +                       return -EINVAL;
> +               if ((attr->map_extra >> 32) || bpf_dynptr_check_size(attr->map_extra) ||
> +                   bpf_mem_alloc_check_size(percpu, attr->map_extra))
> +                       return -E2BIG;
> +       }
> +
>         if (!lru && percpu_lru)
>                 return -EINVAL;
>
> @@ -482,6 +492,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>          */
>         bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
>         bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
> +       bool dynptr_in_key = (attr->map_flags & BPF_F_DYNPTR_IN_KEY);
>         struct bpf_htab *htab;
>         int err, i;
>
> @@ -598,6 +609,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>                         if (err)
>                                 goto free_map_locked;
>                 }
> +               if (dynptr_in_key) {
> +                       err = bpf_mem_alloc_init(&htab->dynptr_ma, 0, false);
> +                       if (err)
> +                               goto free_map_locked;
> +               }
>         }
>
>         return &htab->map;
> @@ -610,6 +626,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>         for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
>                 free_percpu(htab->map_locked[i]);
>         bpf_map_area_free(htab->buckets);
> +       bpf_mem_alloc_destroy(&htab->dynptr_ma);
>         bpf_mem_alloc_destroy(&htab->pcpu_ma);
>         bpf_mem_alloc_destroy(&htab->ma);
>  free_elem_count:
> @@ -620,13 +637,55 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>         return ERR_PTR(err);
>  }
>
> -static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
> +static inline u32 __htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
>  {
>         if (likely(key_len % 4 == 0))
>                 return jhash2(key, key_len / 4, hashrnd);
>         return jhash(key, key_len, hashrnd);
>  }
>
> +static u32 htab_map_dynptr_hash(const void *key, u32 key_len, u32 hashrnd,
> +                               const struct btf_record *rec)
> +{
> +       unsigned int i, cnt = rec->cnt;
> +       unsigned int hash = hashrnd;
> +       unsigned int offset = 0;
> +
> +       for (i = 0; i < cnt; i++) {
> +               const struct btf_field *field = &rec->fields[i];
> +               const struct bpf_dynptr_kern *kptr;
> +               unsigned int len;
> +
> +               if (field->type != BPF_DYNPTR)
> +                       continue;
> +
> +               /* non-dynptr part ? */
> +               if (offset < field->offset)
> +                       hash = jhash(key + offset, field->offset - offset, hash);
> +
> +               /* Skip nullified dynptr */
> +               kptr = key + field->offset;
> +               if (kptr->data) {
> +                       len = __bpf_dynptr_size(kptr);
> +                       hash = jhash(__bpf_dynptr_data(kptr, len), len, hash);
> +               }
> +               offset = field->offset + field->size;
> +       }
> +
> +       if (offset < key_len)
> +               hash = jhash(key + offset, key_len - offset, hash);
> +
> +       return hash;
> +}
> +
> +static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd,
> +                               const struct btf_record *rec)
> +{
> +       if (!rec)
> +               return __htab_map_hash(key, key_len, hashrnd);
> +       return htab_map_dynptr_hash(key, key_len, hashrnd, rec);
> +}
> +
>  static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
>  {
>         return &htab->buckets[hash & (htab->n_buckets - 1)];
> @@ -637,15 +696,68 @@ static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32
>         return &__select_bucket(htab, hash)->head;
>  }
>
> +static bool is_same_dynptr_key(const void *key, const void *tgt, unsigned int key_size,
> +                              const struct btf_record *rec)
> +{
> +       unsigned int i, cnt = rec->cnt;
> +       unsigned int offset = 0;
> +
> +       for (i = 0; i < cnt; i++) {
> +               const struct btf_field *field = &rec->fields[i];
> +               const struct bpf_dynptr_kern *kptr, *tgt_kptr;
> +               const void *data, *tgt_data;
> +               unsigned int len;
> +
> +               if (field->type != BPF_DYNPTR)
> +                       continue;
> +
> +               if (offset < field->offset &&
> +                   memcmp(key + offset, tgt + offset, field->offset - offset))
> +                       return false;
> +
> +               /*
> +                * For a nullified dynptr in the target key, __bpf_dynptr_size()
> +                * will return 0, and there will be no match for the target key.
> +                */
> +               kptr = key + field->offset;
> +               tgt_kptr = tgt + field->offset;
> +               len = __bpf_dynptr_size(kptr);
> +               if (len != __bpf_dynptr_size(tgt_kptr))
> +                       return false;
> +
> +               data = __bpf_dynptr_data(kptr, len);
> +               tgt_data = __bpf_dynptr_data(tgt_kptr, len);
> +               if (memcmp(data, tgt_data, len))
> +                       return false;
> +
> +               offset = field->offset + field->size;
> +       }
> +
> +       if (offset < key_size &&
> +           memcmp(key + offset, tgt + offset, key_size - offset))
> +               return false;
> +
> +       return true;
> +}
> +
> +static inline bool htab_is_same_key(const void *key, const void *tgt, unsigned int key_size,
> +                                   const struct btf_record *rec)
> +{
> +       if (!rec)
> +               return !memcmp(key, tgt, key_size);
> +       return is_same_dynptr_key(key, tgt, key_size, rec);
> +}
> +
>  /* this lookup function can only be called with bucket lock taken */
>  static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
> -                                        void *key, u32 key_size)
> +                                        void *key, u32 key_size,
> +                                        const struct btf_record *record)
>  {
>         struct hlist_nulls_node *n;
>         struct htab_elem *l;
>
>         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
> -               if (l->hash == hash && !memcmp(&l->key, key, key_size))
> +               if (l->hash == hash && htab_is_same_key(l->key, key, key_size, record))
>                         return l;
>
>         return NULL;
> @@ -657,14 +769,15 @@ static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash
>   */
>  static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
>                                                u32 hash, void *key,
> -                                              u32 key_size, u32 n_buckets)
> +                                              u32 key_size, u32 n_buckets,
> +                                              const struct btf_record *record)
>  {
>         struct hlist_nulls_node *n;
>         struct htab_elem *l;
>
>  again:
>         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
> -               if (l->hash == hash && !memcmp(&l->key, key, key_size))
> +               if (l->hash == hash && htab_is_same_key(l->key, key, key_size, record))
>                         return l;

If I'm reading this correctly, the support for dynptr in map keys
adds two map->key_record != NULL checks to the fast path,
hence what you said in the cover letter:

> It seems adding dynptr key support to the hash map degrades lookup
> performance by about 12% and update performance by about 7%. Will
> investigate these degradations first.
>
> a) lookup
> max_entries = 8K
>
> before:
> 0:hash_lookup 72347325 lookups per sec
>
> after:
> 0:hash_lookup 64758890 lookups per sec

is surprising.

Two conditional branches contribute to 12% performance loss?
Something fishy.
Try unlikely() to hopefully recover most of it.
After analyzing 'perf report/annotate', of course.
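
For example (untested, just to illustrate the suggestion), the compare
helper added by this patch could hint the common no-dynptr-key case,
and htab_map_hash() could do the same with likely(!rec):

static inline bool htab_is_same_key(const void *key, const void *tgt,
				    unsigned int key_size,
				    const struct btf_record *rec)
{
	/* most maps have no dynptr in the key; keep that path cheap */
	if (likely(!rec))
		return !memcmp(key, tgt, key_size);
	return is_same_dynptr_key(key, tgt, key_size, rec);
}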

Worst case, we can specialize htab_map_gen_lookup() to
call into different helpers.
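
A rough sketch of that idea, keeping the existing non-dynptr lookup
path untouched; __htab_map_lookup_dynptr_elem() is a hypothetical
helper that would carry the key_record-aware hash and compare:

static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	/* dispatch at verification time, not per lookup */
	if (map->key_record)
		*insn++ = BPF_EMIT_CALL(__htab_map_lookup_dynptr_elem);
	else
		*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}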




