From: Cong Wang <cong.wang@xxxxxxxxxxxxx>

Insert:
  bpf_map_update(&map, &key, &val, flag);

Delete a specific key-val pair:
  bpf_map_delete_elem(&map, &key);

Pop the minimum one:
  bpf_map_pop(&map, &val);

Lookup:
  val = bpf_map_lookup_elem(&map, &key);

Iterator:
  bpf_for_each_map_elem(&map, callback, key, val);

Signed-off-by: Cong Wang <cong.wang@xxxxxxxxxxxxx>
---
 include/linux/bpf_types.h |   1 +
 include/uapi/linux/bpf.h  |   1 +
 kernel/bpf/Makefile       |   2 +-
 kernel/bpf/rbtree.c       | 445 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 448 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/rbtree.c

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 2c6a4f2562a7..c53ba6de1613 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -127,6 +127,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_RBTREE, rbtree_map_ops)
 
 BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 51b9aa640ad2..9492cd3af701 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -935,6 +935,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_TASK_STORAGE,
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
+	BPF_MAP_TYPE_RBTREE,
 };
 
 /* Note that tracing related programs such as
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 341c94f208f4..e60249258c74 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -7,7 +7,7 @@ endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o rbtree.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
diff --git a/kernel/bpf/rbtree.c b/kernel/bpf/rbtree.c
new file mode 100644
index 000000000000..f1a9b1c40b8b
--- /dev/null
+++ b/kernel/bpf/rbtree.c
@@ -0,0 +1,445 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * rbtree.c: eBPF rbtree map
+ *
+ * Copyright (C) 2022, ByteDance, Cong Wang <cong.wang@xxxxxxxxxxxxx>
+ */
+#include <linux/bpf.h>
+#include <linux/slab.h>
+#include <linux/capability.h>
+#include <linux/rbtree.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/math.h>
+#include <linux/seq_file.h>
+
+#define RBTREE_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
+
+/* each rbtree element is struct rbtree_elem + key + value */
+struct rbtree_elem {
+	struct rb_node rbnode;
+	char key[] __aligned(8);
+};
+
+struct rbtree_map {
+	struct bpf_map map;
+	struct bpf_mem_alloc ma;
+	raw_spinlock_t lock;
+	struct rb_root root;
+	atomic_t nr_entries;
+};
+
+#define rb_to_elem(rb) rb_entry_safe(rb, struct rbtree_elem, rbnode)
+#define elem_rb_first(root) rb_to_elem(rb_first(root))
+#define elem_rb_last(root) rb_to_elem(rb_last(root))
+#define elem_rb_next(e) rb_to_elem(rb_next(&(e)->rbnode))
+#define rbtree_walk_safe(e, tmp, root)					\
+		for (e = elem_rb_first(root);				\
+		     tmp = e ? elem_rb_next(e) : NULL, (e != NULL);	\
+		     e = tmp)
+
+static struct rbtree_map *rbtree_map(struct bpf_map *map)
+{
+	return container_of(map, struct rbtree_map, map);
+}
+
+/* Called from syscall */
+static int rbtree_map_alloc_check(union bpf_attr *attr)
+{
+	if (!bpf_capable())
+		return -EPERM;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 ||
+	    attr->map_flags & ~RBTREE_CREATE_FLAG_MASK ||
+	    !bpf_map_flags_access_ok(attr->map_flags))
+		return -EINVAL;
+
+	if (attr->value_size > KMALLOC_MAX_SIZE)
+		/* if value_size is bigger, the user space won't be able to
+		 * access the elements.
+		 */
+		return -E2BIG;
+
+	return 0;
+}
+
+static struct bpf_map *rbtree_map_alloc(union bpf_attr *attr)
+{
+	int numa_node = bpf_map_attr_numa_node(attr);
+	struct rbtree_map *rb;
+	u32 elem_size;
+	int err;
+
+	rb = bpf_map_area_alloc(sizeof(*rb), numa_node);
+	if (!rb)
+		return ERR_PTR(-ENOMEM);
+
+	memset(rb, 0, sizeof(*rb));
+	bpf_map_init_from_attr(&rb->map, attr);
+	raw_spin_lock_init(&rb->lock);
+	rb->root = RB_ROOT;
+	atomic_set(&rb->nr_entries, 0);
+
+	elem_size = sizeof(struct rbtree_elem) +
+			round_up(rb->map.key_size, 8);
+	elem_size += round_up(rb->map.value_size, 8);
+	err = bpf_mem_alloc_init(&rb->ma, elem_size, false);
+	if (err) {
+		bpf_map_area_free(rb);
+		return ERR_PTR(err);
+	}
+	return &rb->map;
+}
+
+static void check_and_free_fields(struct rbtree_map *rb,
+				  struct rbtree_elem *elem)
+{
+	void *map_value = elem->key + round_up(rb->map.key_size, 8);
+
+	if (map_value_has_kptrs(&rb->map))
+		bpf_map_free_kptrs(&rb->map, map_value);
+}
+
+static void rbtree_map_purge(struct bpf_map *map)
+{
+	struct rbtree_map *rb = rbtree_map(map);
+	struct rbtree_elem *e, *tmp;
+
+	rbtree_walk_safe(e, tmp, &rb->root) {
+		rb_erase(&e->rbnode, &rb->root);
+		check_and_free_fields(rb, e);
+		bpf_mem_cache_free(&rb->ma, e);
+	}
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void rbtree_map_free(struct bpf_map *map)
+{
+	struct rbtree_map *rb = rbtree_map(map);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rb->lock, flags);
+	rbtree_map_purge(map);
+	raw_spin_unlock_irqrestore(&rb->lock, flags);
+	bpf_mem_alloc_destroy(&rb->ma);
+	bpf_map_area_free(rb);
+}
+
+static struct rbtree_elem *bpf_rbtree_find(struct rb_root *root, void *key, int size)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct rbtree_elem *e;
+
+	while (*p) {
+		int ret;
+
+		parent = *p;
+		e = rb_to_elem(parent);
+		ret = memcmp(key, e->key, size);
+		if (ret < 0)
+			p = &parent->rb_left;
+		else if (ret > 0)
+			p = &parent->rb_right;
+		else
+			return e;
+	}
+	return NULL;
+}
+
+/* Called from eBPF program or syscall */
+static void *rbtree_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct rbtree_map *rb = rbtree_map(map);
+	struct rbtree_elem *e;
+
+	e = bpf_rbtree_find(&rb->root, key, rb->map.key_size);
+	if (!e)
+		return NULL;
+	return e->key + round_up(rb->map.key_size, 8);
+}
+
+static int check_flags(struct rbtree_elem *old, u64 map_flags)
+{
+	if (old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
+		/* elem already exists */
+		return -EEXIST;
+
+	if (!old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
+		/* elem doesn't exist, cannot update it */
+		return -ENOENT;
+
+	return 0;
+}
+
+static void rbtree_map_insert(struct rbtree_map *rb, struct rbtree_elem *e)
+{
+	struct rb_root *root = &rb->root;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct rbtree_elem *e1;
+
+	while (*p) {
+		parent = *p;
+		e1 = rb_to_elem(parent);
+		if (memcmp(e->key, e1->key, rb->map.key_size) < 0)
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+	}
+	rb_link_node(&e->rbnode, parent, p);
+	rb_insert_color(&e->rbnode, root);
+}
+
+/* Called from syscall or from eBPF program */
+static int rbtree_map_update_elem(struct bpf_map *map, void *key, void *value,
+				  u64 map_flags)
+{
+	struct rbtree_map *rb = rbtree_map(map);
+	void *val = rbtree_map_lookup_elem(map, key);
+	int ret;
+
+	ret = check_flags(val, map_flags);
+	if (ret)
+		return ret;
+
+	if (!val) {
+		struct rbtree_elem *e_new;
+		unsigned long flags;
+
+		e_new = bpf_mem_cache_alloc(&rb->ma);
+		if (!e_new)
+			return -ENOMEM;
+		val = e_new->key + round_up(rb->map.key_size, 8);
+		check_and_init_map_value(&rb->map, val);
+		memcpy(e_new->key, key, rb->map.key_size);
+		raw_spin_lock_irqsave(&rb->lock, flags);
+		rbtree_map_insert(rb, e_new);
+		raw_spin_unlock_irqrestore(&rb->lock, flags);
+		atomic_inc(&rb->nr_entries);
+	}
+
+	if (map_flags & BPF_F_LOCK)
+		copy_map_value_locked(map, val, value, false);
+	else
+		copy_map_value(map, val, value);
+	return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int rbtree_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct rbtree_map *rb = rbtree_map(map);
+	struct rbtree_elem *e;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rb->lock, flags);
+	e = bpf_rbtree_find(&rb->root, key, rb->map.key_size);
+	if (!e) {
+		raw_spin_unlock_irqrestore(&rb->lock, flags);
+		return -ENOENT;
+	}
+	rb_erase(&e->rbnode, &rb->root);
+	raw_spin_unlock_irqrestore(&rb->lock, flags);
+	check_and_free_fields(rb, e);
+	bpf_mem_cache_free(&rb->ma, e);
+	atomic_dec(&rb->nr_entries);
+	return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int rbtree_map_pop_elem(struct bpf_map *map, void *value)
+{
+	struct rbtree_map *rb = rbtree_map(map);
+	struct rbtree_elem *e = elem_rb_first(&rb->root);
+	unsigned long flags;
+	void *val;
+
+	if (!e)
+		return -ENOENT;
+	raw_spin_lock_irqsave(&rb->lock, flags);
+	rb_erase(&e->rbnode, &rb->root);
+	raw_spin_unlock_irqrestore(&rb->lock, flags);
+	val = e->key + round_up(rb->map.key_size, 8);
+	copy_map_value(map, value, val);
+	check_and_free_fields(rb, e);
+	bpf_mem_cache_free(&rb->ma, e);
+	atomic_dec(&rb->nr_entries);
+	return 0;
+}
+
+/* Called from syscall */
+static int rbtree_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct rbtree_map *rb = rbtree_map(map);
+	struct rbtree_elem *e;
+
+	if (!key) {
+		e = elem_rb_first(&rb->root);
+		if (!e)
+			return -ENOENT;
+		goto found;
+	}
+	e = bpf_rbtree_find(&rb->root, key, rb->map.key_size);
+	if (!e)
+		return -ENOENT;
+	e = elem_rb_next(e);
+	if (!e)
+		return 0;
+found:
+	memcpy(next_key, e->key, map->key_size);
+	return 0;
+}
+
+static int bpf_for_each_rbtree_map(struct bpf_map *map,
+				   bpf_callback_t callback_fn,
+				   void *callback_ctx, u64 flags)
+{
+	struct rbtree_map *rb = rbtree_map(map);
+	struct rbtree_elem *e, *tmp;
+	void *key, *value;
+	u32 num_elems = 0;
+	u64 ret = 0;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	rbtree_walk_safe(e, tmp, &rb->root) {
+		num_elems++;
+		key = e->key;
+		value = key + round_up(rb->map.key_size, 8);
+		ret = callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value,
+				  (u64)(long)callback_ctx, 0);
+		/* return value: 0 - continue, 1 - stop and return */
+		if (ret)
+			break;
+	}
+
+	return num_elems;
+}
+
+struct rbtree_map_seq_info {
+	struct bpf_map *map;
+	struct rbtree_map *rb;
+};
+
+static void *rbtree_map_seq_find_next(struct rbtree_map_seq_info *info,
+				      struct rbtree_elem *prev_elem)
+{
+	const struct rbtree_map *rb = info->rb;
+	struct rbtree_elem *elem;
+
+	/* try to find next elem in the same bucket */
+	if (prev_elem) {
+		elem = elem_rb_next(prev_elem);
+		if (elem)
+			return elem;
+		return NULL;
+	}
+
+	return elem_rb_first(&rb->root);
+}
+
+static void *rbtree_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct rbtree_map_seq_info *info = seq->private;
+
+	if (*pos == 0)
+		++*pos;
+
+	/* pairs with rbtree_map_seq_stop */
+	rcu_read_lock();
+	return rbtree_map_seq_find_next(info, NULL);
+}
+
+static void *rbtree_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct rbtree_map_seq_info *info = seq->private;
+
+	++*pos;
+	return rbtree_map_seq_find_next(info, v);
+}
+
+static int rbtree_map_seq_show(struct seq_file *seq, void *v)
+{
+	struct rbtree_map_seq_info *info = seq->private;
+	struct bpf_iter__bpf_map_elem ctx = {};
+	struct rbtree_elem *elem = v;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, !elem);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.map = info->map;
+	if (elem) {
+		ctx.key = elem->key;
+		ctx.value = elem->key + round_up(info->map->key_size, 8);
+	}
+
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void rbtree_map_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!v)
+		(void)rbtree_map_seq_show(seq, NULL);
+
+	/* pairs with rbtree_map_seq_start */
+	rcu_read_unlock();
+}
+
+static const struct seq_operations rbtree_map_seq_ops = {
+	.start	= rbtree_map_seq_start,
+	.next	= rbtree_map_seq_next,
+	.stop	= rbtree_map_seq_stop,
+	.show	= rbtree_map_seq_show,
+};
+
+static int rbtree_map_init_seq_private(void *priv_data,
+				       struct bpf_iter_aux_info *aux)
+{
+	struct rbtree_map_seq_info *info = priv_data;
+
+	bpf_map_inc_with_uref(aux->map);
+	info->map = aux->map;
+	info->rb = rbtree_map(info->map);
+	return 0;
+}
+
+static void rbtree_map_fini_seq_private(void *priv_data)
+{
+	struct rbtree_map_seq_info *info = priv_data;
+
+	bpf_map_put_with_uref(info->map);
+}
+
+static const struct bpf_iter_seq_info rbtree_map_iter_seq_info = {
+	.seq_ops		= &rbtree_map_seq_ops,
+	.init_seq_private	= rbtree_map_init_seq_private,
+	.fini_seq_private	= rbtree_map_fini_seq_private,
+	.seq_priv_size		= sizeof(struct rbtree_map_seq_info),
+};
+
+BTF_ID_LIST_SINGLE(rbtree_map_btf_ids, struct, rbtree_map)
+const struct bpf_map_ops rbtree_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc_check = rbtree_map_alloc_check,
+	.map_alloc = rbtree_map_alloc,
+	.map_free = rbtree_map_free,
+	.map_lookup_elem = rbtree_map_lookup_elem,
+	.map_update_elem = rbtree_map_update_elem,
+	.map_delete_elem = rbtree_map_delete_elem,
+	.map_pop_elem = rbtree_map_pop_elem,
+	.map_get_next_key = rbtree_map_get_next_key,
+	.map_set_for_each_callback_args = map_set_for_each_callback_args,
+	.map_for_each_callback = bpf_for_each_rbtree_map,
+	.map_btf_id = &rbtree_map_btf_ids[0],
+	.iter_seq_info = &rbtree_map_iter_seq_info,
+};
+
-- 
2.34.1
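[Editor's note] A minimal usage sketch for reviewers, not part of the patch.
The commit message uses shorthand names; the sketch below assumes the map is
driven through the existing generic helpers (bpf_map_update_elem(),
bpf_map_lookup_elem(), bpf_map_delete_elem(), bpf_map_pop_elem(),
bpf_for_each_map_elem()) and the usual libbpf conventions (SEC(), __uint(),
__type()). The program type, the __u64 key/value layout and the availability
of these helpers for BPF_MAP_TYPE_RBTREE are illustrative assumptions, not
something this patch wires up by itself:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Keys are compared with memcmp() by the map, so ordering is byte-wise,
 * not numeric, for multi-byte integer keys on little-endian hosts.
 */
struct {
	__uint(type, BPF_MAP_TYPE_RBTREE);
	__uint(max_entries, 1024);
	__type(key, __u64);
	__type(value, __u64);
} rbtree SEC(".maps");

/* Callback for bpf_for_each_map_elem(): return 0 to continue, 1 to stop. */
static long count_cb(struct bpf_map *map, const void *key,
		     void *value, void *ctx)
{
	(*(__u64 *)ctx)++;
	return 0;
}

SEC("tc")
int rbtree_demo(struct __sk_buff *skb)
{
	__u64 key = bpf_ktime_get_ns();
	__u64 val = skb->len;
	__u64 nr = 0;
	__u64 *p;

	/* Insert (or update) a key/value pair. */
	bpf_map_update_elem(&rbtree, &key, &val, BPF_ANY);

	/* Point lookup by key. */
	p = bpf_map_lookup_elem(&rbtree, &key);
	if (p)
		/* Delete a specific key/value pair. */
		bpf_map_delete_elem(&rbtree, &key);

	/* Walk the elements in key order. */
	bpf_for_each_map_elem(&rbtree, count_cb, &nr, 0);

	/* Pop the element with the smallest (memcmp-wise) key. */
	bpf_map_pop_elem(&rbtree, &val);

	return 0;
}

char _license[] SEC("license") = "GPL";

From user space, the same map can also be walked in ascending key order with
the bpf_map_get_next_key() syscall wrapper, since rbtree_map_get_next_key()
starts at rb_first() and then returns the in-order successor of the given key.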