On Tue, Oct 8, 2024 at 2:02 AM Hou Tao <houtao@xxxxxxxxxxxxxxx> wrote:
>
> From: Hou Tao <houtao1@xxxxxxxxxx>
>
> The patch supports lookup, update, delete and lookup_delete operations
> for a hash map with a dynptr key. There are two major differences between
> the implementation of a normal hash map and a dynptr-keyed hash map:
>
> 1) a dynptr-keyed hash map doesn't support pre-allocation.
> The reason is that the dynptr in the map key is allocated dynamically
> through the bpf mem allocator. The length limit for these dynptrs is
> currently 4088 bytes. Because these dynptrs are allocated dynamically,
> the memory consumption will be smaller compared with a normal hash map
> when there are big differences between the lengths of these dynptrs.
>
> 2) the freed element in a dynptr-keyed map will not be reused immediately.
> For a normal hash map, the freed element may be reused immediately by a
> newly-added element, so a lookup may return an incorrect result due to
> element deletion and element reuse. A dynptr-keyed map cannot do that,
> because there are pointers (dynptrs) in the map key and the updates of
> these dynptrs are not atomic: both the address and the length of the
> dynptr will be updated. If the element is reused immediately, access
> through the dynptr in the freed element may incur an invalid memory
> access due to a mismatch between the address and the size of the dynptr,
> so the freed element is only reused after one RCU grace period.
>
> Besides the differences above, a dynptr-keyed hash map also needs to
> handle the maybe-nullified dynptr in the map key.
>
> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
> ---
>  kernel/bpf/hashtab.c | 283 +++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 257 insertions(+), 26 deletions(-)
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index b14b87463ee0..edf19d36a413 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -88,6 +88,7 @@ struct bpf_htab {
>          struct bpf_map map;
>          struct bpf_mem_alloc ma;
>          struct bpf_mem_alloc pcpu_ma;
> +        struct bpf_mem_alloc dynptr_ma;
>          struct bucket *buckets;
>          void *elems;
>          union {
> @@ -425,6 +426,7 @@ static int htab_map_alloc_check(union bpf_attr *attr)
>          bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
>          bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
>          bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
> +        bool dynptr_in_key = (attr->map_flags & BPF_F_DYNPTR_IN_KEY);
>          int numa_node = bpf_map_attr_numa_node(attr);
>
>          BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
> @@ -438,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
>              !bpf_map_flags_access_ok(attr->map_flags))
>                  return -EINVAL;
>
> +        if (dynptr_in_key) {
> +                if (percpu || lru || prealloc || !attr->map_extra)
> +                        return -EINVAL;
> +                if ((attr->map_extra >> 32) || bpf_dynptr_check_size(attr->map_extra) ||
> +                    bpf_mem_alloc_check_size(percpu, attr->map_extra))
> +                        return -E2BIG;
> +        }
> +
>          if (!lru && percpu_lru)
>                  return -EINVAL;
>
> @@ -482,6 +492,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>           */
>          bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
>          bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
> +        bool dynptr_in_key = (attr->map_flags & BPF_F_DYNPTR_IN_KEY);
>          struct bpf_htab *htab;
>          int err, i;
>
> @@ -598,6 +609,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>                          if (err)
>                                  goto free_map_locked;
>                  }
> +                if (dynptr_in_key) {
> +                        err = bpf_mem_alloc_init(&htab->dynptr_ma, 0, false);
> +                        if (err)
> +                                goto free_map_locked;
> +                }
>          }
>
>          return &htab->map;
> @@ -610,6 +626,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>          for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
>                  free_percpu(htab->map_locked[i]);
>          bpf_map_area_free(htab->buckets);
> +        bpf_mem_alloc_destroy(&htab->dynptr_ma);
>          bpf_mem_alloc_destroy(&htab->pcpu_ma);
>          bpf_mem_alloc_destroy(&htab->ma);
>  free_elem_count:
> @@ -620,13 +637,55 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>          return ERR_PTR(err);
>  }
>
> -static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
> +static inline u32 __htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
>  {
>          if (likely(key_len % 4 == 0))
>                  return jhash2(key, key_len / 4, hashrnd);
>          return jhash(key, key_len, hashrnd);
>  }
>
> +static u32 htab_map_dynptr_hash(const void *key, u32 key_len, u32 hashrnd,
> +                                const struct btf_record *rec)
> +{
> +        unsigned int i, cnt = rec->cnt;
> +        unsigned int hash = hashrnd;
> +        unsigned int offset = 0;
> +
> +        for (i = 0; i < cnt; i++) {
> +                const struct btf_field *field = &rec->fields[i];
> +                const struct bpf_dynptr_kern *kptr;
> +                unsigned int len;
> +
> +                if (field->type != BPF_DYNPTR)
> +                        continue;
> +
> +                /* non-dynptr part ? */
> +                if (offset < field->offset)
> +                        hash = jhash(key + offset, field->offset - offset, hash);
> +
> +                /* Skip nullified dynptr */
> +                kptr = key + field->offset;
> +                if (kptr->data) {
> +                        len = __bpf_dynptr_size(kptr);
> +                        hash = jhash(__bpf_dynptr_data(kptr, len), len, hash);
> +                }
> +                offset = field->offset + field->size;
> +        }
> +
> +        if (offset < key_len)
> +                hash = jhash(key + offset, key_len - offset, hash);
> +
> +        return hash;
> +}
> +
> +static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd,
> +                                const struct btf_record *rec)
> +{
> +        if (!rec)
> +                return __htab_map_hash(key, key_len, hashrnd);
> +        return htab_map_dynptr_hash(key, key_len, hashrnd, rec);
> +}
> +
>  static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
>  {
>          return &htab->buckets[hash & (htab->n_buckets - 1)];
> @@ -637,15 +696,68 @@ static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32
>          return &__select_bucket(htab, hash)->head;
>  }
>
> +static bool is_same_dynptr_key(const void *key, const void *tgt, unsigned int key_size,
> +                               const struct btf_record *rec)
> +{
> +        unsigned int i, cnt = rec->cnt;
> +        unsigned int offset = 0;
> +
> +        for (i = 0; i < cnt; i++) {
> +                const struct btf_field *field = &rec->fields[i];
> +                const struct bpf_dynptr_kern *kptr, *tgt_kptr;
> +                const void *data, *tgt_data;
> +                unsigned int len;
> +
> +                if (field->type != BPF_DYNPTR)
> +                        continue;
> +
> +                if (offset < field->offset &&
> +                    memcmp(key + offset, tgt + offset, field->offset - offset))
> +                        return false;
> +
> +                /*
> +                 * For a nullified dynptr in the target key, __bpf_dynptr_size()
> +                 * will return 0, and there will be no match for the target key.
> +                 */
> +                kptr = key + field->offset;
> +                tgt_kptr = tgt + field->offset;
> +                len = __bpf_dynptr_size(kptr);
> +                if (len != __bpf_dynptr_size(tgt_kptr))
> +                        return false;
> +
> +                data = __bpf_dynptr_data(kptr, len);
> +                tgt_data = __bpf_dynptr_data(tgt_kptr, len);
> +                if (memcmp(data, tgt_data, len))
> +                        return false;
> +
> +                offset = field->offset + field->size;
> +        }
> +
> +        if (offset < key_size &&
> +            memcmp(key + offset, tgt + offset, key_size - offset))
> +                return false;
> +
> +        return true;
> +}
> +
> +static inline bool htab_is_same_key(const void *key, const void *tgt, unsigned int key_size,
> +                                    const struct btf_record *rec)
> +{
> +        if (!rec)
> +                return !memcmp(key, tgt, key_size);
> +        return is_same_dynptr_key(key, tgt, key_size, rec);
> +}
> +
>  /* this lookup function can only be called with bucket lock taken */
>  static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
> -                                         void *key, u32 key_size)
> +                                         void *key, u32 key_size,
> +                                         const struct btf_record *record)
>  {
>          struct hlist_nulls_node *n;
>          struct htab_elem *l;
>
>          hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
> -                if (l->hash == hash && !memcmp(&l->key, key, key_size))
> +                if (l->hash == hash && htab_is_same_key(l->key, key, key_size, record))
>                          return l;
>
>          return NULL;
> @@ -657,14 +769,15 @@ static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash
>   */
>  static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
>                                                 u32 hash, void *key,
> -                                               u32 key_size, u32 n_buckets)
> +                                               u32 key_size, u32 n_buckets,
> +                                               const struct btf_record *record)
>  {
>          struct hlist_nulls_node *n;
>          struct htab_elem *l;
>
>  again:
>          hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
> -                if (l->hash == hash && !memcmp(&l->key, key, key_size))
> +                if (l->hash == hash && htab_is_same_key(l->key, key, key_size, record))
>                          return l;

If I'm reading this correctly, the support for dynptr in map keys adds two
map->key_record != NULL checks in the fast path, hence what you said in
the cover letter:

> It seems adding dynptr key support in hash map degrades the lookup
> performance by about 12% and degrades the update performance by about 7%.
> Will investigate these degradations first.
>
> a) lookup
> max_entries = 8K
>
> before:
> 0:hash_lookup 72347325 lookups per sec
>
> after:
> 0:hash_lookup 64758890 lookups per sec

is surprising. Two conditional branches contributing to a 12% performance
loss? Something is fishy. Try unlikely() to hopefully recover most of it.
After analyzing 'perf report/annotate', of course. Worst case, we can
specialize htab_map_gen_lookup() to call into different helpers.
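
For the unlikely() experiment, a minimal sketch of what I mean, against the
two helpers added by this patch (untested, only to illustrate keeping the
!rec case on the straight-line path):

static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd,
                                const struct btf_record *rec)
{
        /* rec is NULL for every map without a dynptr in the key, so keep
         * that case on the fall-through path.
         */
        if (likely(!rec))
                return __htab_map_hash(key, key_len, hashrnd);
        return htab_map_dynptr_hash(key, key_len, hashrnd, rec);
}

static inline bool htab_is_same_key(const void *key, const void *tgt,
                                    unsigned int key_size,
                                    const struct btf_record *rec)
{
        if (likely(!rec))
                return !memcmp(key, tgt, key_size);
        return is_same_dynptr_key(key, tgt, key_size, rec);
}

Whether the two branches are really the culprit is exactly what perf
annotate should confirm first, though.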
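
And if the annotations alone don't recover it, here is a rough sketch of
the specialization idea: a separate elem lookup for dynptr-keyed maps, so
the existing path can pass a constant NULL record and the compiler can fold
both new checks away for plain keys. The helper name below is made up, and
I'm assuming the series keeps the key record in map->key_record:

/* Hypothetical helper, only used by maps created with BPF_F_DYNPTR_IN_KEY;
 * htab_map_gen_lookup()/htab_map_lookup_elem() would pick it based on
 * whether the map has a key record.
 */
static struct htab_elem *__htab_map_lookup_dynptr_elem(struct bpf_map *map,
                                                       void *key)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        const struct btf_record *rec = map->key_record;
        struct hlist_nulls_head *head;
        u32 hash;

        hash = htab_map_hash(key, map->key_size, htab->hashrnd, rec);
        head = select_bucket(htab, hash);

        return lookup_nulls_elem_raw(head, hash, key, map->key_size,
                                     htab->n_buckets, rec);
}

The existing __htab_map_lookup_elem() would then pass NULL for the record,
so maps without a dynptr in the key stay close to the old code.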