From: Hou Tao <houtao1@xxxxxxxxxx>

The following patch will move the invocation of check_and_free_fields()
in htab_map_update_elem() outside of the bucket lock. However, the
bucket lock is still necessary at the moment, because when
alloc_htab_elem() returns, the overwritten element has already been
stashed in htab->extra_elems. If check_and_free_fields() were invoked
after the bucket lock was released, the stashed element could be reused
by a concurrent update procedure, and the freeing in
check_and_free_fields() would run concurrently with that reuse and lead
to bugs.

The fix is to split the reuse and stash of an extra_elems element into
two steps:
1) reuse the per-cpu extra_elems with the bucket lock held.
2) refill the per-cpu extra_elems after the bucket lock is released.

This patch adds support for stashing the per-cpu extra_elems after the
bucket lock is released. Because the refill may run concurrently,
cmpxchg_release() is used. The release semantics is necessary to ensure
that the freeing of ptrs or special fields in the map value is
completed before the element is reused by a concurrent update
procedure.

Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
---
 kernel/bpf/hashtab.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 903447a340d3..3c6eebabb492 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -946,14 +946,28 @@ static void dec_elem_count(struct bpf_htab *htab)
 	atomic_dec(&htab->count);
 }
 
-
-static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l, bool refill_extra)
 {
 	htab_put_fd_value(htab, l);
 
 	if (htab_is_prealloc(htab)) {
-		bpf_map_dec_elem_count(&htab->map);
 		check_and_free_fields(htab, l);
+
+		if (refill_extra) {
+			struct htab_elem **extra;
+
+			/* Use cmpxchg_release() to ensure the freeing of ptrs
+			 * or special fields in map value is completed when the
+			 * update procedure reuses the extra element. It will
+			 * pair with smp_load_acquire() when reading extra_elems
+			 * pointer.
+			 */
+			extra = this_cpu_ptr(htab->extra_elems);
+			if (cmpxchg_release(extra, NULL, l) == NULL)
+				return;
+		}
+
+		bpf_map_dec_elem_count(&htab->map);
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		dec_elem_count(htab);
@@ -1207,7 +1221,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		if (old_map_ptr)
 			map->ops->map_fd_put_ptr(map, old_map_ptr, true);
 		if (!htab_is_prealloc(htab))
-			free_htab_elem(htab, l_old);
+			free_htab_elem(htab, l_old, false);
 	}
 	return 0;
 err:
@@ -1461,7 +1475,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
 	htab_unlock_bucket(htab, b, hash, flags);
 
 	if (l)
-		free_htab_elem(htab, l);
+		free_htab_elem(htab, l, false);
 	return ret;
 }
 
@@ -1677,7 +1691,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 		if (is_lru_map)
 			htab_lru_push_free(htab, l);
 		else
-			free_htab_elem(htab, l);
+			free_htab_elem(htab, l, false);
 	}
 
 	return ret;
@@ -1899,7 +1913,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 		if (is_lru_map)
 			htab_lru_push_free(htab, l);
 		else
-			free_htab_elem(htab, l);
+			free_htab_elem(htab, l, false);
 	}
 
 next_batch:
--
2.29.2
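
As a side note, the release/acquire pairing described above can be
modeled in plain C11. The following standalone sketch is illustrative
only, not kernel code: the per-cpu extra_elems stash is reduced to a
single global slot, cmpxchg_release() is modeled by
atomic_compare_exchange_strong_explicit() with memory_order_release,
and smp_load_acquire() by atomic_load_explicit() with
memory_order_acquire. The names elem, slot, refill() and try_reuse()
are made up for the example.

#include <stdatomic.h>
#include <stdio.h>

struct elem {
	int fields_freed;	/* stands in for check_and_free_fields() work */
};

/* Models one per-cpu extra_elems slot; one global slot suffices here. */
static _Atomic(struct elem *) slot;

/* Free path: finish freeing special fields, then try to stash the
 * element. The release ordering guarantees that a reader which observes
 * the non-NULL pointer with acquire semantics also observes
 * fields_freed == 1.
 */
static int refill(struct elem *e)
{
	struct elem *expected = NULL;

	e->fields_freed = 1;	/* check_and_free_fields() has completed */
	return atomic_compare_exchange_strong_explicit(&slot, &expected, e,
						       memory_order_release,
						       memory_order_relaxed);
}

/* Update path: take the stashed element, pairing with the release above. */
static struct elem *try_reuse(void)
{
	struct elem *e = atomic_load_explicit(&slot, memory_order_acquire);

	if (e && atomic_compare_exchange_strong_explicit(&slot, &e, NULL,
							 memory_order_relaxed,
							 memory_order_relaxed))
		return e;	/* freeing of fields is guaranteed visible */
	return NULL;
}

int main(void)
{
	struct elem e = { .fields_freed = 0 };
	struct elem *reused;

	if (!refill(&e))
		return 1;
	reused = try_reuse();
	if (reused)
		printf("reused element, fields_freed=%d\n",
		       reused->fields_freed);
	return 0;
}

If the refill loses the race (the slot is already occupied), the element
falls back to the freelist in the real code, which is why the kernel
hunk only returns early when cmpxchg_release() succeeds.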