Hi, On 12/22/2023 8:21 PM, Philo Lu wrote: > BPF_MAP_TYPE_RELAY is implemented based on relay interface, which > creates per-cpu buffer to transfer data. Each buffer is essentially a > list of fix-sized sub-buffers, and is exposed to user space as files in > debugfs. attr->max_entries is used as subbuf size and attr->map_extra is > used as subbuf num. Currently, the default value of subbuf num is 8. > > The data can be accessed by read or mmap via these files. For example, > if there are 2 cpus, files could be `/sys/kernel/debug/mydir/my_rmap0` > and `/sys/kernel/debug/mydir/my_rmap1`. > > Buffer-only mode is used to create the relay map, which just allocates > the buffer without creating user-space files. Then user can setup the > files with map_update_elem, thus allowing user to define the directory > name in debugfs. map_update_elem is implemented in the following patch. > > A new map flag named BPF_F_OVERWRITE is introduced to set overwrite mode > of relay map. Beside adding a new map type, could we consider only use kfuncs to support the creation of rchan and the write of rchan ? I think bpf_cpumask will be a good reference. > > Signed-off-by: Philo Lu <lulie@xxxxxxxxxxxxxxxxx> > --- > include/linux/bpf_types.h | 3 + > include/uapi/linux/bpf.h | 7 ++ > kernel/bpf/Makefile | 3 + > kernel/bpf/relaymap.c | 157 ++++++++++++++++++++++++++++++++++++++ > kernel/bpf/syscall.c | 1 + > 5 files changed, 171 insertions(+) > create mode 100644 kernel/bpf/relaymap.c > SNIP > diff --git a/kernel/bpf/relaymap.c b/kernel/bpf/relaymap.c > new file mode 100644 > index 000000000000..d0adc7f67758 > --- /dev/null > +++ b/kernel/bpf/relaymap.c > @@ -0,0 +1,157 @@ > +#include <linux/cpumask.h> > +#include <linux/debugfs.h> > +#include <linux/filter.h> > +#include <linux/relay.h> > +#include <linux/slab.h> > +#include <linux/bpf.h> > +#include <linux/err.h> > + > +#define RELAY_CREATE_FLAG_MASK (BPF_F_OVERWRITE) > + > +struct bpf_relay_map { > + struct bpf_map map; > + struct rchan *relay_chan; > + struct rchan_callbacks relay_cb; > +}; It seems that there is no need to add relay_cb in bpf_relay_map. We could define two kinds of rchan_callbacks: one for non-overwrite mode and another one for overwrite mode. > + > +static struct dentry *create_buf_file_handler(const char *filename, > + struct dentry *parent, umode_t mode, > + struct rchan_buf *buf, int *is_global) > +{ > + /* Because we do relay_late_setup_files(), create_buf_file(NULL, NULL, ...) > + * will be called by relay_open. > + */ > + if (!filename) > + return NULL; > + > + return debugfs_create_file(filename, mode, parent, buf, > + &relay_file_operations); > +} > + > +static int remove_buf_file_handler(struct dentry *dentry) > +{ > + debugfs_remove(dentry); > + return 0; > +} > + > +/* For non-overwrite, use default subbuf_start cb */ > +static int subbuf_start_overwrite(struct rchan_buf *buf, void *subbuf, > + void *prev_subbuf, size_t prev_padding) > +{ > + return 1; > +} > + > +/* bpf_attr is used as follows: > + * - key size: must be 0 > + * - value size: value will be used as directory name by map_update_elem > + * (to create relay files). If passed as 0, it will be set to NAME_MAX as > + * default > + * > + * - max_entries: subbuf size > + * - map_extra: subbuf num, default as 8 > + * > + * When alloc, we do not set up relay files considering dir_name conflicts. > + * Instead we use relay_late_setup_files() in map_update_elem(), and thus the > + * value is used as dir_name, and map->name is used as base_filename. > + */ > +static struct bpf_map *relay_map_alloc(union bpf_attr *attr) > +{ > + struct bpf_relay_map *rmap; > + > + if (unlikely(attr->map_flags & ~RELAY_CREATE_FLAG_MASK)) > + return ERR_PTR(-EINVAL); > + > + /* key size must be 0 in relay map */ > + if (unlikely(attr->key_size)) > + return ERR_PTR(-EINVAL); > + > + if (unlikely(attr->value_size > NAME_MAX)) { > + pr_warn("value_size should be no more than %d\n", NAME_MAX); > + return ERR_PTR(-EINVAL); > + } else if (attr->value_size == 0) > + attr->value_size = NAME_MAX; > + > + /* set default subbuf num */ > + attr->map_extra = attr->map_extra & UINT_MAX; Should we reject invalid map_extra and return -EINVAL instead ? > + if (!attr->map_extra) > + attr->map_extra = 8; > + > + if (!attr->map_name || strlen(attr->map_name) == 0) > + return ERR_PTR(-EINVAL); > + > + rmap = bpf_map_area_alloc(sizeof(*rmap), NUMA_NO_NODE); > + if (!rmap) > + return ERR_PTR(-ENOMEM); > + > + bpf_map_init_from_attr(&rmap->map, attr); > + > + rmap->relay_cb.create_buf_file = create_buf_file_handler; > + rmap->relay_cb.remove_buf_file = remove_buf_file_handler; > + if (attr->map_flags & BPF_F_OVERWRITE) > + rmap->relay_cb.subbuf_start = subbuf_start_overwrite; > + > + rmap->relay_chan = relay_open(NULL, NULL, > + attr->max_entries, attr->map_extra, > + &rmap->relay_cb, NULL); > + if (!rmap->relay_chan) > + return ERR_PTR(-EINVAL); Need to free rmap. > + > + return &rmap->map; > +} > + > +static void relay_map_free(struct bpf_map *map) > +{ > + struct bpf_relay_map *rmap; > + > + rmap = container_of(map, struct bpf_relay_map, map); > + relay_close(rmap->relay_chan); > + debugfs_remove_recursive(rmap->relay_chan->parent); rmap->relay_chan may be freed by relay_close(), so it is not safe to dereference relay_chan->parent here. And is debugfs_remove_recursive() necessary here ? > + kfree(rmap); > +} > + > +static void *relay_map_lookup_elem(struct bpf_map *map, void *key) > +{ > + return ERR_PTR(-EOPNOTSUPP); > +} > + > +static long relay_map_update_elem(struct bpf_map *map, void *key, void *value, > + u64 flags) > +{ > + return -EOPNOTSUPP; > +} > + > +static long relay_map_delete_elem(struct bpf_map *map, void *key) > +{ > + return -EOPNOTSUPP; > +} > + > +static int relay_map_get_next_key(struct bpf_map *map, void *key, > + void *next_key) > +{ > + return -EOPNOTSUPP; > +} > + > +static u64 relay_map_mem_usage(const struct bpf_map *map) > +{ > + struct bpf_relay_map *rmap; > + u64 usage = sizeof(struct bpf_relay_map); > + > + rmap = container_of(map, struct bpf_relay_map, map); > + usage += sizeof(struct rchan); > + usage += (sizeof(struct rchan_buf) + rmap->relay_chan->alloc_size) > + * num_online_cpus(); > + return usage; > +} > + > +BTF_ID_LIST_SINGLE(relay_map_btf_ids, struct, bpf_relay_map) > +const struct bpf_map_ops relay_map_ops = { > + .map_meta_equal = bpf_map_meta_equal, > + .map_alloc = relay_map_alloc, > + .map_free = relay_map_free, > + .map_lookup_elem = relay_map_lookup_elem, > + .map_update_elem = relay_map_update_elem, > + .map_delete_elem = relay_map_delete_elem, > + .map_get_next_key = relay_map_get_next_key, > + .map_mem_usage = relay_map_mem_usage, > + .map_btf_id = &relay_map_btf_ids[0], > +}; >