BPF_MAP_TYPE_RELAY is implemented based on relay interface, which creates per-cpu buffer to transfer data. Each buffer is essentially a list of fix-sized sub-buffers, and is exposed to user space as files in debugfs. All of these compose a "relay chanel", which is the kernel of a relay map. Currently, attr->max_entries is used as subbuf size and attr->map_extra is used as subbuf num, and the default value of subbuf num is 8. A new map flag named BPF_F_OVERWRITE is also introduced to set overwrite mode of relay map. A relay map is represented as a directory in debugfs, and the per-cpu buffers are files in this directory. Users can get the data through read or mmap. To avoid directory name conflicting, relay_map_update_elem is provided to set the name. In fact, we create the relay channel and buffers with BPF_MAP_CREATE, and create relay files and bind them with the channel using BPF_MAP_UPDATE_ELEM. Generally, map_update_elem should be called once and only once. Here is an example: ``` struct { __uint(type, BPF_MAP_TYPE_RELAY); __uint(max_entries, 4096); } my_relay SEC(".maps"); ... char dir_name[] = "relay_test"; bpf_map_update_elem(map_fd, NULL, dir_name, 0); ``` Assume there are 2 cpus, we will have 2 files: ``` /sys/kerenl/debug/relay_test/my_relay0 /sys/kerenl/debug/relay_test/my_relay1 ``` Each represents a per-cpu buffer with size 8 * 4096 B (there are 8 subbufs by default, each with size 4096B). Signed-off-by: Philo Lu <lulie@xxxxxxxxxxxxxxxxx> --- include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 7 ++ kernel/bpf/Makefile | 3 + kernel/bpf/relaymap.c | 199 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 2 + 5 files changed, 214 insertions(+) create mode 100644 kernel/bpf/relaymap.c diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fc0d6f32c687..c122d7b494c5 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -132,6 +132,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) +#ifdef CONFIG_RELAY +BPF_MAP_TYPE(BPF_MAP_TYPE_RELAY, relay_map_ops) +#endif BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 754e68ca8744..143b75676bd3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -951,6 +951,7 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, + BPF_MAP_TYPE_RELAY, }; /* Note that tracing related programs such as @@ -1330,6 +1331,9 @@ enum { /* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */ BPF_F_PATH_FD = (1U << 14), + +/* Enable overwrite for relay map */ + BPF_F_OVERWRITE = (1U << 15), }; /* Flags for BPF_PROG_QUERY. */ @@ -1401,6 +1405,9 @@ union bpf_attr { * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the * number of hash functions (if 0, the bloom filter will default * to using 5 hash functions). + * + * BPF_MAP_TYPE_RELAY - the lowest 32 bits indicate the number of + * relay subbufs (if 0, the number will be set to 8 by default). */ __u64 map_extra; }; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f526b7573e97..45b35bb0e572 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -10,6 +10,9 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +ifeq ($(CONFIG_RELAY),y) +obj-$(CONFIG_BPF_SYSCALL) += relaymap.o +endif obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o diff --git a/kernel/bpf/relaymap.c b/kernel/bpf/relaymap.c new file mode 100644 index 000000000000..02b33a8e6b6c --- /dev/null +++ b/kernel/bpf/relaymap.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/filter.h> +#include <linux/relay.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/err.h> + +#define RELAY_CREATE_FLAG_MASK (BPF_F_OVERWRITE) + +struct bpf_relay_map { + struct bpf_map map; + struct rchan *relay_chan; + struct rchan_callbacks relay_cb; +}; + +static struct dentry *create_buf_file_handler(const char *filename, + struct dentry *parent, umode_t mode, + struct rchan_buf *buf, int *is_global) +{ + /* Because we do relay_late_setup_files(), create_buf_file(NULL, NULL, ...) + * will be called by relay_open. + */ + if (!filename) + return NULL; + + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static int remove_buf_file_handler(struct dentry *dentry) +{ + debugfs_remove(dentry); + return 0; +} + +/* For non-overwrite, use default subbuf_start cb */ +static int subbuf_start_overwrite(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + return 1; +} + +/* bpf_attr is used as follows: + * - key size: must be 0 + * - value size: value will be used as directory name by map_update_elem + * (to create relay files). If passed as 0, it will be set to NAME_MAX as + * default + * + * - max_entries: subbuf size + * - map_extra: subbuf num, default as 8 + * + * When alloc, we do not set up relay files considering dir_name conflicts. + * Instead we use relay_late_setup_files() in map_update_elem(), and thus the + * value is used as dir_name, and map->name is used as base_filename. + */ +static struct bpf_map *relay_map_alloc(union bpf_attr *attr) +{ + struct bpf_relay_map *rmap; + + if (unlikely(attr->map_flags & ~RELAY_CREATE_FLAG_MASK)) + return ERR_PTR(-EINVAL); + + /* key size must be 0 in relay map */ + if (unlikely(attr->key_size)) + return ERR_PTR(-EINVAL); + + /* value size is used as directory name length */ + if (unlikely(attr->value_size > NAME_MAX)) { + pr_warn("value_size should be no more than %d\n", NAME_MAX); + return ERR_PTR(-EINVAL); + } else if (attr->value_size == 0) + attr->value_size = NAME_MAX; + + /* set default subbuf num */ + if (unlikely(attr->map_extra & ~UINT_MAX)) + return ERR_PTR(-EINVAL); + attr->map_extra = attr->map_extra & UINT_MAX; + if (!attr->map_extra) + attr->map_extra = 8; + + if (strlen(attr->map_name) == 0) + return ERR_PTR(-EINVAL); + + rmap = bpf_map_area_alloc(sizeof(*rmap), NUMA_NO_NODE); + if (!rmap) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rmap->map, attr); + + rmap->relay_cb.create_buf_file = create_buf_file_handler; + rmap->relay_cb.remove_buf_file = remove_buf_file_handler; + if (attr->map_flags & BPF_F_OVERWRITE) + rmap->relay_cb.subbuf_start = subbuf_start_overwrite; + + rmap->relay_chan = relay_open(NULL, NULL, + attr->max_entries, attr->map_extra, + &rmap->relay_cb, NULL); + if (!rmap->relay_chan) { + bpf_map_area_free(rmap); + return ERR_PTR(-EINVAL); + } + + return &rmap->map; +} + +static void relay_map_free(struct bpf_map *map) +{ + struct bpf_relay_map *rmap; + struct dentry *parent; + + rmap = container_of(map, struct bpf_relay_map, map); + + parent = rmap->relay_chan->parent; + relay_close(rmap->relay_chan); + /* relay_chan->parent should be removed mannually if exists. */ + debugfs_remove_recursive(parent); + bpf_map_area_free(rmap); +} + +static void *relay_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static long relay_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + struct bpf_relay_map *rmap; + struct dentry *parent; + int err; + + if (unlikely(flags)) + return -EINVAL; + + if (unlikely(key)) + return -EINVAL; + + /* If the directory already exists, debugfs_create_dir will fail. It could + * have been created by map_update_elem before, or another system that uses + * debugfs. + * + * Note that the directory name passed as value should not be longer than + * map->value_size, including the '\0' at the end. + */ + ((char *)value)[map->value_size - 1] = '\0'; + parent = debugfs_create_dir(value, NULL); + if (IS_ERR_OR_NULL(parent)) + return PTR_ERR(parent); + + /* We don't need a lock here, because the relay channel is protected in + * relay_late_setup_files() with a mutex. + */ + rmap = container_of(map, struct bpf_relay_map, map); + err = relay_late_setup_files(rmap->relay_chan, map->name, parent); + if (err) { + debugfs_remove_recursive(parent); + return err; + } + + return 0; +} + +static long relay_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EOPNOTSUPP; +} + +static int relay_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -EOPNOTSUPP; +} + +static u64 relay_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_relay_map *rmap; + u64 usage = sizeof(struct bpf_relay_map); + + rmap = container_of(map, struct bpf_relay_map, map); + usage += sizeof(struct rchan); + usage += (sizeof(struct rchan_buf) + rmap->relay_chan->alloc_size) + * num_online_cpus(); + return usage; +} + +BTF_ID_LIST_SINGLE(relay_map_btf_ids, struct, bpf_relay_map) +const struct bpf_map_ops relay_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = relay_map_alloc, + .map_free = relay_map_free, + .map_lookup_elem = relay_map_lookup_elem, + .map_update_elem = relay_map_update_elem, + .map_delete_elem = relay_map_delete_elem, + .map_get_next_key = relay_map_get_next_key, + .map_mem_usage = relay_map_mem_usage, + .map_btf_id = &relay_map_btf_ids[0], +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1bf9805ee185..d6b7949e29c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1147,6 +1147,7 @@ static int map_create(union bpf_attr *attr) } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && + attr->map_type != BPF_MAP_TYPE_RELAY && attr->map_extra != 0) return -EINVAL; @@ -1202,6 +1203,7 @@ static int map_create(union bpf_attr *attr) case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_CGROUP_STORAGE: case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: + case BPF_MAP_TYPE_RELAY: /* unprivileged */ break; case BPF_MAP_TYPE_SK_STORAGE: -- 2.32.0.3.g01195cf9f