On Mon, Jan 27, 2025 at 8:22 AM Leon Hwang <leon.hwang@xxxxxxxxx> wrote: > > This patch introduces support for global percpu data in libbpf. A new > section named ".percpu" is added, similar to the existing ".data" section. > Internal maps are created for ".percpu" sections, which are then > initialized and populated accordingly. > > The changes include: > > * Introduction of the ".percpu" section in libbpf. > * Creation of internal maps for percpu data. > * Initialization and population of these maps. > > This enhancement allows BPF programs to efficiently manage and access > percpu global data, improving performance for use cases that require > percpu buffer. > > Signed-off-by: Leon Hwang <leon.hwang@xxxxxxxxx> > --- > tools/lib/bpf/libbpf.c | 172 ++++++++++++++++++++++++++++++++--------- > 1 file changed, 135 insertions(+), 37 deletions(-) > > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c > index 194809da51725..6da6004c5c84d 100644 > --- a/tools/lib/bpf/libbpf.c > +++ b/tools/lib/bpf/libbpf.c > @@ -516,6 +516,7 @@ struct bpf_struct_ops { > }; > > #define DATA_SEC ".data" > +#define PERCPU_DATA_SEC ".percpu" > #define BSS_SEC ".bss" > #define RODATA_SEC ".rodata" > #define KCONFIG_SEC ".kconfig" > @@ -530,6 +531,7 @@ enum libbpf_map_type { > LIBBPF_MAP_BSS, > LIBBPF_MAP_RODATA, > LIBBPF_MAP_KCONFIG, > + LIBBPF_MAP_PERCPU_DATA, nit: let's keep it shorter: LIBBPF_MAP_PERCPU > }; > > struct bpf_map_def { > @@ -562,6 +564,7 @@ struct bpf_map { > __u32 btf_value_type_id; > __u32 btf_vmlinux_value_type_id; > enum libbpf_map_type libbpf_type; > + void *data; > void *mmaped; > struct bpf_struct_ops *st_ops; > struct bpf_map *inner_map; > @@ -640,6 +643,7 @@ enum sec_type { > SEC_DATA, > SEC_RODATA, > SEC_ST_OPS, > + SEC_PERCPU_DATA, ditto, just SEC_PERCPU? 
> }; > > struct elf_sec_desc { > @@ -1923,13 +1927,24 @@ static bool map_is_mmapable(struct bpf_object *obj, struct bpf_map *map) > return false; > } > > +static void map_copy_data(struct bpf_map *map, const void *data) > +{ > + bool is_percpu_data = map->libbpf_type == LIBBPF_MAP_PERCPU_DATA; > + size_t data_sz = map->def.value_size; > + > + if (data) > + memcpy(is_percpu_data ? map->data : map->mmaped, data, data_sz); > +} > + > static int > bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, > const char *real_name, int sec_idx, void *data, size_t data_sz) > { > + bool is_percpu_data = type == LIBBPF_MAP_PERCPU_DATA; > struct bpf_map_def *def; > struct bpf_map *map; > size_t mmap_sz; > + size_t elem_sz; nit: just: size_t mmap_sz, elem_sz; > int err; > > map = bpf_object__add_map(obj); > @@ -1948,7 +1963,8 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, > } > > def = &map->def; > - def->type = BPF_MAP_TYPE_ARRAY; > + def->type = is_percpu_data ? BPF_MAP_TYPE_PERCPU_ARRAY > + : BPF_MAP_TYPE_ARRAY; nit: single line > def->key_size = sizeof(int); > def->value_size = data_sz; > def->max_entries = 1; [...] > + if (map_is_mmapable(obj, map)) > + def->map_flags |= BPF_F_MMAPABLE; > + > + mmap_sz = bpf_map_mmap_sz(map); > + map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, > + MAP_SHARED | MAP_ANONYMOUS, -1, 0); > + if (map->mmaped == MAP_FAILED) { > + err = -errno; > + map->mmaped = NULL; > + pr_warn("map '%s': failed to alloc content buffer: %s\n", > + map->name, errstr(err)); > + goto free_name; > + } > + > + map_copy_data(map, data); why not memcpy() here? you know it's not a percpu map, so why obscure that memcpy? > + } > > pr_debug("map %td is \"%s\"\n", map - obj->maps, map->name); > return 0; [...] 
> @@ -5125,23 +5180,54 @@ static int > bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) > { > enum libbpf_map_type map_type = map->libbpf_type; > - int err, zero = 0; > + bool is_percpu_data = map_type == LIBBPF_MAP_PERCPU_DATA; > + int err = 0, zero = 0; > + void *data = NULL; > + int num_cpus, i; > + size_t data_sz; > + size_t elem_sz; > size_t mmap_sz; nit: keep those size_t variables grouped: `size_t mmap_sz, data_sz, elem_sz;` > > + data_sz = map->def.value_size; > + if (is_percpu_data) { > + num_cpus = libbpf_num_possible_cpus(); > + if (num_cpus < 0) { > + err = libbpf_err_errno(num_cpus); why? num_cpus *IS* the error code if num_cpus < 0 > + pr_warn("map '%s': failed to get num_cpus: %s\n", > + bpf_map__name(map), errstr(err)); this is unlikely to happen, I'd drop pr_warn() > + return err; > + } > + > + elem_sz = roundup(data_sz, 8); > + data_sz = elem_sz * num_cpus; > + data = malloc(data_sz); > + if (!data) { > + err = -ENOMEM; > + pr_warn("map '%s': failed to malloc memory: %s\n", > + bpf_map__name(map), errstr(err)); -ENOMEM is rather self-descriptive (and generally not expected), so don't add pr_warn() for such cases > + return err; > + } > + > + for (i = 0; i < num_cpus; i++) > + memcpy(data + i * elem_sz, map->data, elem_sz); > + } else { > + data = map->mmaped; > + } > + [...] > static void bpf_map__destroy(struct bpf_map *map); > @@ -8120,7 +8209,9 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) > struct bpf_map *m; > > bpf_object__for_each_map(m, obj) { > - if (!bpf_map__is_internal(m)) > + if (!bpf_map__is_internal(m) || > + /* percpu data map is internal and not-mmapable. */ > + m->libbpf_type == LIBBPF_MAP_PERCPU_DATA) original logic would work anyway, no? 
let's not add unnecessary special casing here > continue; > if (!kernel_supports(obj, FEAT_ARRAY_MMAP)) > m->def.map_flags &= ~BPF_F_MMAPABLE; > @@ -9041,6 +9132,8 @@ static void bpf_map__destroy(struct bpf_map *map) > if (map->mmaped && map->mmaped != map->obj->arena_data) > munmap(map->mmaped, bpf_map_mmap_sz(map)); > map->mmaped = NULL; > + if (map->data) > + zfree(&map->data); > this whole map->mmaped and map->data duality and duplication seems not worth it, tbh. Maybe we should keep using map->mmaped (we probably could name it more generically at some point, but I don't want to start bike shedding now) even for malloc'ed memory? After all, we already have ARENA as another special case? WDYT, can your changes be implemented by reusing map->mmaped, taking into account the type of map? pw-bot: cr > if (map->st_ops) { > zfree(&map->st_ops->data); > @@ -10132,14 +10225,18 @@ int bpf_map__fd(const struct bpf_map *map) > > static bool map_uses_real_name(const struct bpf_map *map) > { > - /* Since libbpf started to support custom .data.* and .rodata.* maps, > - * their user-visible name differs from kernel-visible name. Users see > - * such map's corresponding ELF section name as a map name. > - * This check distinguishes .data/.rodata from .data.* and .rodata.* > - * maps to know which name has to be returned to the user. > + /* Since libbpf started to support custom .data.*, .percpu.* and > + * .rodata.* maps, their user-visible name differs from kernel-visible > + * name. Users see such map's corresponding ELF section name as a map > + * name. This check distinguishes .data/.percpu/.rodata from .data.*, > + * .percpu.* and .rodata.* maps to know which name has to be returned to > + * the user. 
> */ > if (map->libbpf_type == LIBBPF_MAP_DATA && strcmp(map->real_name, DATA_SEC) != 0) > return true; > + if (map->libbpf_type == LIBBPF_MAP_PERCPU_DATA && > + strcmp(map->real_name, PERCPU_DATA_SEC) != 0) > + return true; nit: shorten LIBBPF_MAP_PERCPU_DATA and keep single line, please > if (map->libbpf_type == LIBBPF_MAP_RODATA && strcmp(map->real_name, RODATA_SEC) != 0) > return true; > return false; > @@ -10348,7 +10445,8 @@ int bpf_map__set_initial_value(struct bpf_map *map, > if (map->obj->loaded || map->reused) > return libbpf_err(-EBUSY); > > - if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG) > + if ((!map->mmaped && !map->data) || > + map->libbpf_type == LIBBPF_MAP_KCONFIG) > return libbpf_err(-EINVAL); > > if (map->def.type == BPF_MAP_TYPE_ARENA) > @@ -10358,7 +10456,7 @@ int bpf_map__set_initial_value(struct bpf_map *map, > if (size != actual_sz) > return libbpf_err(-EINVAL); > > - memcpy(map->mmaped, data, size); > + map_copy_data(map, data); > return 0; > } > > @@ -10370,7 +10468,7 @@ void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize) > return map->st_ops->data; > } > > - if (!map->mmaped) > + if (!map->mmaped && !map->data) > return NULL; > > if (map->def.type == BPF_MAP_TYPE_ARENA) > @@ -10378,7 +10476,7 @@ void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize) > else > *psize = map->def.value_size; > > - return map->mmaped; > + return map->libbpf_type == LIBBPF_MAP_PERCPU_DATA ? map->data : map->mmaped; Good chunk of changes like this wouldn't be necessary if we just reuse map->mmaped. > } > > bool bpf_map__is_internal(const struct bpf_map *map) > -- > 2.47.1 >