Re: [PATCH bpf-next v1 1/3] bpf: implement relay map basis

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Dec 27, 2023 at 6:01 PM Philo Lu <lulie@xxxxxxxxxxxxxxxxx> wrote:
>
> BPF_MAP_TYPE_RELAY is implemented based on the relay interface, which
> creates per-cpu buffers to transfer data. Each buffer is essentially a
> list of fixed-size sub-buffers, and is exposed to user space as a file in
> debugfs. Together these make up a "relay channel", which is the core of a
> relay map.
>
> Currently, attr->max_entries is used as subbuf size and attr->map_extra
> is used as subbuf num, and the default value of subbuf num is 8. A new
> map flag named BPF_F_OVERWRITE is also introduced to set overwrite mode
> of relay map.
>
> A relay map is represented as a directory in debugfs, and the per-cpu
> buffers are files in this directory. Users can get the data through read
> or mmap.
>
> To avoid directory name conflicts, relay_map_update_elem is provided
> to set the name. In fact, we create the relay channel and buffers with
> BPF_MAP_CREATE, and create relay files and bind them with the channel
> using BPF_MAP_UPDATE_ELEM. Generally, map_update_elem should be called
> once and only once.
>
> Here is an example:
> ```
> struct {
> __uint(type, BPF_MAP_TYPE_RELAY);
> __uint(max_entries, 4096);
> } my_relay SEC(".maps");
> ...
> char dir_name[] = "relay_test";
> bpf_map_update_elem(map_fd, NULL, dir_name, 0);
> ```
>
> Assuming there are 2 CPUs, we will have 2 files:
> ```
> /sys/kernel/debug/relay_test/my_relay0
> /sys/kernel/debug/relay_test/my_relay1
> ```

Is there a specific reason why relayfs necessitates creating an
individual file for each CPU? Alternatively, are there any approaches
available to collectively expose all CPUs using a single file?

When dealing with a large number of available CPUs, such as 236,
reading the buffers using a command like `cat
/sys/kernel/debug/relay_test/my_relay{0..235} | awk '{}'` can become
a bit cumbersome and tedious.

> Each represents a per-cpu buffer with size 8 * 4096 B (there are 8
> subbufs by default, each with size 4096B).
>
> Signed-off-by: Philo Lu <lulie@xxxxxxxxxxxxxxxxx>
> ---
>  include/linux/bpf_types.h |   3 +
>  include/uapi/linux/bpf.h  |   7 ++
>  kernel/bpf/Makefile       |   3 +
>  kernel/bpf/relaymap.c     | 199 ++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c      |   2 +
>  5 files changed, 214 insertions(+)
>  create mode 100644 kernel/bpf/relaymap.c
>
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index fc0d6f32c687..c122d7b494c5 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -132,6 +132,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
> +#ifdef CONFIG_RELAY
> +BPF_MAP_TYPE(BPF_MAP_TYPE_RELAY, relay_map_ops)
> +#endif
>
>  BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
>  BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 754e68ca8744..143b75676bd3 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -951,6 +951,7 @@ enum bpf_map_type {
>         BPF_MAP_TYPE_BLOOM_FILTER,
>         BPF_MAP_TYPE_USER_RINGBUF,
>         BPF_MAP_TYPE_CGRP_STORAGE,
> +       BPF_MAP_TYPE_RELAY,
>  };
>
>  /* Note that tracing related programs such as
> @@ -1330,6 +1331,9 @@ enum {
>
>  /* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */
>         BPF_F_PATH_FD           = (1U << 14),
> +
> +/* Enable overwrite for relay map */
> +       BPF_F_OVERWRITE         = (1U << 15),
>  };
>
>  /* Flags for BPF_PROG_QUERY. */
> @@ -1401,6 +1405,9 @@ union bpf_attr {
>                  * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
>                  * number of hash functions (if 0, the bloom filter will default
>                  * to using 5 hash functions).
> +                *
> +                * BPF_MAP_TYPE_RELAY - the lowest 32 bits indicate the number of
> +                * relay subbufs (if 0, the number will be set to 8 by default).
>                  */
>                 __u64   map_extra;
>         };
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index f526b7573e97..45b35bb0e572 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -10,6 +10,9 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
>  obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
>  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
> +ifeq ($(CONFIG_RELAY),y)
> +obj-$(CONFIG_BPF_SYSCALL) += relaymap.o
> +endif
>  obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
>  obj-${CONFIG_BPF_LSM}    += bpf_inode_storage.o
>  obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
> diff --git a/kernel/bpf/relaymap.c b/kernel/bpf/relaymap.c
> new file mode 100644
> index 000000000000..02b33a8e6b6c
> --- /dev/null
> +++ b/kernel/bpf/relaymap.c
> @@ -0,0 +1,199 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/cpumask.h>
> +#include <linux/debugfs.h>
> +#include <linux/filter.h>
> +#include <linux/relay.h>
> +#include <linux/slab.h>
> +#include <linux/bpf.h>
> +#include <linux/err.h>
> +
> +#define RELAY_CREATE_FLAG_MASK (BPF_F_OVERWRITE)
> +
> +struct bpf_relay_map {
> +       struct bpf_map map;
> +       struct rchan *relay_chan;
> +       struct rchan_callbacks relay_cb;
> +};
> +
> +static struct dentry *create_buf_file_handler(const char *filename,
> +                                      struct dentry *parent, umode_t mode,
> +                                      struct rchan_buf *buf, int *is_global)
> +{
> +       /* Because we do relay_late_setup_files(), create_buf_file(NULL, NULL, ...)
> +        * will be called by relay_open.
> +        */
> +       if (!filename)
> +               return NULL;
> +
> +       return debugfs_create_file(filename, mode, parent, buf,
> +                                  &relay_file_operations);
> +}
> +
> +static int remove_buf_file_handler(struct dentry *dentry)
> +{
> +       debugfs_remove(dentry);
> +       return 0;
> +}
> +
> +/* For non-overwrite, use default subbuf_start cb */
> +static int subbuf_start_overwrite(struct rchan_buf *buf, void *subbuf,
> +                                      void *prev_subbuf, size_t prev_padding)
> +{
> +       return 1;
> +}
> +
> +/* bpf_attr is used as follows:
> + * - key size: must be 0
> + * - value size: value will be used as directory name by map_update_elem
> + *   (to create relay files). If passed as 0, it will be set to NAME_MAX as
> + *   default
> + *
> + * - max_entries: subbuf size
> + * - map_extra: subbuf num, default as 8
> + *
> + * When alloc, we do not set up relay files considering dir_name conflicts.
> + * Instead we use relay_late_setup_files() in map_update_elem(), and thus the
> + * value is used as dir_name, and map->name is used as base_filename.
> + */
> +static struct bpf_map *relay_map_alloc(union bpf_attr *attr)
> +{
> +       struct bpf_relay_map *rmap;
> +
> +       if (unlikely(attr->map_flags & ~RELAY_CREATE_FLAG_MASK))
> +               return ERR_PTR(-EINVAL);
> +
> +       /* key size must be 0 in relay map */
> +       if (unlikely(attr->key_size))
> +               return ERR_PTR(-EINVAL);
> +
> +       /* value size is used as directory name length */
> +       if (unlikely(attr->value_size > NAME_MAX)) {
> +               pr_warn("value_size should be no more than %d\n", NAME_MAX);
> +               return ERR_PTR(-EINVAL);
> +       } else if (attr->value_size == 0)
> +               attr->value_size = NAME_MAX;
> +
> +       /* set default subbuf num */
> +       if (unlikely(attr->map_extra & ~UINT_MAX))
> +               return ERR_PTR(-EINVAL);
> +       attr->map_extra = attr->map_extra & UINT_MAX;
> +       if (!attr->map_extra)
> +               attr->map_extra = 8;
> +
> +       if (strlen(attr->map_name) == 0)
> +               return ERR_PTR(-EINVAL);
> +
> +       rmap = bpf_map_area_alloc(sizeof(*rmap), NUMA_NO_NODE);
> +       if (!rmap)
> +               return ERR_PTR(-ENOMEM);
> +
> +       bpf_map_init_from_attr(&rmap->map, attr);
> +
> +       rmap->relay_cb.create_buf_file = create_buf_file_handler;
> +       rmap->relay_cb.remove_buf_file = remove_buf_file_handler;
> +       if (attr->map_flags & BPF_F_OVERWRITE)
> +               rmap->relay_cb.subbuf_start = subbuf_start_overwrite;
> +
> +       rmap->relay_chan = relay_open(NULL, NULL,
> +                               attr->max_entries, attr->map_extra,
> +                               &rmap->relay_cb, NULL);
> +       if (!rmap->relay_chan) {
> +               bpf_map_area_free(rmap);
> +               return ERR_PTR(-EINVAL);
> +       }
> +
> +       return &rmap->map;
> +}
> +
> +static void relay_map_free(struct bpf_map *map)
> +{
> +       struct bpf_relay_map *rmap;
> +       struct dentry *parent;
> +
> +       rmap = container_of(map, struct bpf_relay_map, map);
> +
> +       parent = rmap->relay_chan->parent;
> +       relay_close(rmap->relay_chan);
> +       /* relay_chan->parent should be removed manually if it exists. */
> +       debugfs_remove_recursive(parent);
> +       bpf_map_area_free(rmap);
> +}
> +
> +static void *relay_map_lookup_elem(struct bpf_map *map, void *key)
> +{
> +       return ERR_PTR(-EOPNOTSUPP);
> +}
> +
> +static long relay_map_update_elem(struct bpf_map *map, void *key, void *value,
> +                                  u64 flags)
> +{
> +       struct bpf_relay_map *rmap;
> +       struct dentry *parent;
> +       int err;
> +
> +       if (unlikely(flags))
> +               return -EINVAL;
> +
> +       if (unlikely(key))
> +               return -EINVAL;
> +
> +       /* If the directory already exists, debugfs_create_dir will fail. It could
> +        * have been created by map_update_elem before, or another system that uses
> +        * debugfs.
> +        *
> +        * Note that the directory name passed as value should not be longer than
> +        * map->value_size, including the '\0' at the end.
> +        */
> +       ((char *)value)[map->value_size - 1] = '\0';
> +       parent = debugfs_create_dir(value, NULL);
> +       if (IS_ERR_OR_NULL(parent))
> +               return PTR_ERR(parent);
> +
> +       /* We don't need a lock here, because the relay channel is protected in
> +        * relay_late_setup_files() with a mutex.
> +        */
> +       rmap = container_of(map, struct bpf_relay_map, map);
> +       err = relay_late_setup_files(rmap->relay_chan, map->name, parent);
> +       if (err) {
> +               debugfs_remove_recursive(parent);
> +               return err;
> +       }
> +
> +       return 0;
> +}
> +
> +static long relay_map_delete_elem(struct bpf_map *map, void *key)
> +{
> +       return -EOPNOTSUPP;
> +}
> +
> +static int relay_map_get_next_key(struct bpf_map *map, void *key,
> +                                   void *next_key)
> +{
> +       return -EOPNOTSUPP;
> +}
> +
> +static u64 relay_map_mem_usage(const struct bpf_map *map)
> +{
> +       struct bpf_relay_map *rmap;
> +       u64 usage = sizeof(struct bpf_relay_map);
> +
> +       rmap = container_of(map, struct bpf_relay_map, map);
> +       usage += sizeof(struct rchan);
> +       usage += (sizeof(struct rchan_buf) + rmap->relay_chan->alloc_size)
> +                        * num_online_cpus();
> +       return usage;
> +}
> +
> +BTF_ID_LIST_SINGLE(relay_map_btf_ids, struct, bpf_relay_map)
> +const struct bpf_map_ops relay_map_ops = {
> +       .map_meta_equal = bpf_map_meta_equal,
> +       .map_alloc = relay_map_alloc,
> +       .map_free = relay_map_free,
> +       .map_lookup_elem = relay_map_lookup_elem,
> +       .map_update_elem = relay_map_update_elem,
> +       .map_delete_elem = relay_map_delete_elem,
> +       .map_get_next_key = relay_map_get_next_key,
> +       .map_mem_usage = relay_map_mem_usage,
> +       .map_btf_id = &relay_map_btf_ids[0],
> +};
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 1bf9805ee185..d6b7949e29c7 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1147,6 +1147,7 @@ static int map_create(union bpf_attr *attr)
>         }
>
>         if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
> +           attr->map_type != BPF_MAP_TYPE_RELAY &&
>             attr->map_extra != 0)
>                 return -EINVAL;
>
> @@ -1202,6 +1203,7 @@ static int map_create(union bpf_attr *attr)
>         case BPF_MAP_TYPE_USER_RINGBUF:
>         case BPF_MAP_TYPE_CGROUP_STORAGE:
>         case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
> +       case BPF_MAP_TYPE_RELAY:
>                 /* unprivileged */
>                 break;
>         case BPF_MAP_TYPE_SK_STORAGE:
> --
> 2.32.0.3.g01195cf9f
>


-- 
Regards
Yafang





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux