On 6/2/25 08:09, Andrii Nakryiko wrote:
> On Mon, Jan 27, 2025 at 8:22 AM Leon Hwang <leon.hwang@xxxxxxxxx> wrote:
>>
>> This patch introduces support for global percpu data in libbpf. A new
>> section named ".percpu" is added, similar to the existing ".data" section.
>> Internal maps are created for ".percpu" sections, which are then
>> initialized and populated accordingly.
>>
>> The changes include:
>>
>> * Introduction of the ".percpu" section in libbpf.
>> * Creation of internal maps for percpu data.
>> * Initialization and population of these maps.
>>
>> This enhancement allows BPF programs to efficiently manage and access
>> percpu global data, improving performance for use cases that require
>> a percpu buffer.
>>
>> Signed-off-by: Leon Hwang <leon.hwang@xxxxxxxxx>
>> ---
>>  tools/lib/bpf/libbpf.c | 172 ++++++++++++++++++++++++++++++++---------
>>  1 file changed, 135 insertions(+), 37 deletions(-)
>>
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index 194809da51725..6da6004c5c84d 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -516,6 +516,7 @@ struct bpf_struct_ops {
>>  };
>>
>>  #define DATA_SEC ".data"
>> +#define PERCPU_DATA_SEC ".percpu"
>>  #define BSS_SEC ".bss"
>>  #define RODATA_SEC ".rodata"
>>  #define KCONFIG_SEC ".kconfig"
>> @@ -530,6 +531,7 @@ enum libbpf_map_type {
>>         LIBBPF_MAP_BSS,
>>         LIBBPF_MAP_RODATA,
>>         LIBBPF_MAP_KCONFIG,
>> +       LIBBPF_MAP_PERCPU_DATA,
>
> nit: let's keep it shorter: LIBBPF_MAP_PERCPU
>

Ack.

>>  };
>>
>>  struct bpf_map_def {
>> @@ -562,6 +564,7 @@ struct bpf_map {
>>         __u32 btf_value_type_id;
>>         __u32 btf_vmlinux_value_type_id;
>>         enum libbpf_map_type libbpf_type;
>> +       void *data;
>>         void *mmaped;
>>         struct bpf_struct_ops *st_ops;
>>         struct bpf_map *inner_map;
>> @@ -640,6 +643,7 @@ enum sec_type {
>>         SEC_DATA,
>>         SEC_RODATA,
>>         SEC_ST_OPS,
>> +       SEC_PERCPU_DATA,
>
> ditto, just SEC_PERCPU?
>

Ack.

>>  };
>>
>>  struct elf_sec_desc {
>> @@ -1923,13 +1927,24 @@ static bool map_is_mmapable(struct bpf_object *obj, struct bpf_map *map)
>>         return false;
>>  }
>>
>> +static void map_copy_data(struct bpf_map *map, const void *data)
>> +{
>> +       bool is_percpu_data = map->libbpf_type == LIBBPF_MAP_PERCPU_DATA;
>> +       size_t data_sz = map->def.value_size;
>> +
>> +       if (data)
>> +               memcpy(is_percpu_data ? map->data : map->mmaped, data, data_sz);
>> +}
>> +
>>  static int
>>  bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type,
>>                               const char *real_name, int sec_idx, void *data, size_t data_sz)
>>  {
>> +       bool is_percpu_data = type == LIBBPF_MAP_PERCPU_DATA;
>>         struct bpf_map_def *def;
>>         struct bpf_map *map;
>>         size_t mmap_sz;
>> +       size_t elem_sz;
>
> nit: just:
>
> size_t mmap_sz, elem_sz;
>
>>         int err;
>>
>>         map = bpf_object__add_map(obj);
>> @@ -1948,7 +1963,8 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type,
>>         }
>>
>>         def = &map->def;
>> -       def->type = BPF_MAP_TYPE_ARRAY;
>> +       def->type = is_percpu_data ? BPF_MAP_TYPE_PERCPU_ARRAY
>> +                                  : BPF_MAP_TYPE_ARRAY;
>
> nit: single line
>
>>         def->key_size = sizeof(int);
>>         def->value_size = data_sz;
>>         def->max_entries = 1;
>
> [...]
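A side note for anyone skimming the thread: with the hunk above, a
".percpu" section becomes a single-entry BPF_MAP_TYPE_PERCPU_ARRAY whose
value is the whole section image, mirroring how ".data" becomes a
single-entry BPF_MAP_TYPE_ARRAY. Purely as an illustration (this is not
code from the patch, and sec_sz is a stand-in for the section's data
size), the user-space equivalent of the internal map would be roughly:

	/* sketch: shape of the internal ".percpu" map */
	int fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_ARRAY, ".percpu",
				sizeof(int), /* key_size */
				sec_sz,      /* value_size: whole section image */
				1,           /* max_entries */
				NULL);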
>
>> +               if (map_is_mmapable(obj, map))
>> +                       def->map_flags |= BPF_F_MMAPABLE;
>> +
>> +               mmap_sz = bpf_map_mmap_sz(map);
>> +               map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE,
>> +                                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
>> +               if (map->mmaped == MAP_FAILED) {
>> +                       err = -errno;
>> +                       map->mmaped = NULL;
>> +                       pr_warn("map '%s': failed to alloc content buffer: %s\n",
>> +                               map->name, errstr(err));
>> +                       goto free_name;
>> +               }
>> +
>> +               map_copy_data(map, data);
>
> why not memcpy() here? you know it's not percpu map, so why obscuring
> that memcpy?
>
>
>> +       }
>>
>>         pr_debug("map %td is \"%s\"\n", map - obj->maps, map->name);
>>         return 0;
>
> [...]
>
>> @@ -5125,23 +5180,54 @@ static int
>>  bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
>>  {
>>         enum libbpf_map_type map_type = map->libbpf_type;
>> -       int err, zero = 0;
>> +       bool is_percpu_data = map_type == LIBBPF_MAP_PERCPU_DATA;
>> +       int err = 0, zero = 0;
>> +       void *data = NULL;
>> +       int num_cpus, i;
>> +       size_t data_sz;
>> +       size_t elem_sz;
>>         size_t mmap_sz;
>
> nit: keep those size_t variables grouped: `size_t mmap_sz, data_sz, elem_sz;`
>

Ack.

>>
>> +       data_sz = map->def.value_size;
>> +       if (is_percpu_data) {
>> +               num_cpus = libbpf_num_possible_cpus();
>> +               if (num_cpus < 0) {
>> +                       err = libbpf_err_errno(num_cpus);
>
> why? num_cpus *IS* error code if num_cpus < 0
>

Ack. Thanks for pointing this out.

>> +                       pr_warn("map '%s': failed to get num_cpus: %s\n",
>> +                               bpf_map__name(map), errstr(err));
>
> this is unlikely to happen, I'd drop pr_warn()
>

Ack.

>> +                       return err;
>> +               }
>> +
>> +               elem_sz = roundup(data_sz, 8);
>> +               data_sz = elem_sz * num_cpus;
>> +               data = malloc(data_sz);
>> +               if (!data) {
>> +                       err = -ENOMEM;
>> +                       pr_warn("map '%s': failed to malloc memory: %s\n",
>> +                               bpf_map__name(map), errstr(err));
>
> -ENOMEM is rather self-descriptive (and generally not expected), so
> don't add pr_warn() for such cases
>

Ack.

>> +                       return err;
>> +               }
>> +
>> +               for (i = 0; i < num_cpus; i++)
>> +                       memcpy(data + i * elem_sz, map->data, elem_sz);
>> +       } else {
>> +               data = map->mmaped;
>> +       }
>> +
>
> [...]
>
>>  static void bpf_map__destroy(struct bpf_map *map);
>> @@ -8120,7 +8209,9 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj)
>>         struct bpf_map *m;
>>
>>         bpf_object__for_each_map(m, obj) {
>> -               if (!bpf_map__is_internal(m))
>> +               if (!bpf_map__is_internal(m) ||
>> +                   /* percpu data map is internal and not-mmapable. */
>> +                   m->libbpf_type == LIBBPF_MAP_PERCPU_DATA)
>
> original logic would work anyways, no? let's not add unnecessary
> special casing here
>

The original logic works for it. I'll remove it.

>>                         continue;
>>                 if (!kernel_supports(obj, FEAT_ARRAY_MMAP))
>>                         m->def.map_flags &= ~BPF_F_MMAPABLE;
>> @@ -9041,6 +9132,8 @@ static void bpf_map__destroy(struct bpf_map *map)
>>         if (map->mmaped && map->mmaped != map->obj->arena_data)
>>                 munmap(map->mmaped, bpf_map_mmap_sz(map));
>>         map->mmaped = NULL;
>> +       if (map->data)
>> +               zfree(&map->data);
>>
>
> this whole map->mmaped and map->data duality and duplication seems not
> worth it, tbh. Maybe we should keep using map->mmaped (we probably
> could name it more generically at some point, but I don't want to
> start bike shedding now) even for malloc'ed memory? After all, we
> already have ARENA as another special case? WDYT, can your changes be
> implemented by reusing map->mmaped, taking into account a type of map?
>

It is better to reuse map->mmaped. I'll take a try.
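For context on the elem_sz = roundup(data_sz, 8) above: the kernel
exchanges percpu array values as one 8-byte-aligned copy per possible
CPU, so user space must size its lookup buffer the same way. A minimal
sketch of reading back such a map (the helper name is illustrative, not
part of the patch):

	#include <errno.h>
	#include <stdlib.h>
	#include <bpf/bpf.h>    /* bpf_map_lookup_elem() */
	#include <bpf/libbpf.h> /* libbpf_num_possible_cpus() */

	static int read_percpu_section(int map_fd, size_t value_sz)
	{
		size_t elem_sz = (value_sz + 7) & ~(size_t)7; /* roundup(value_sz, 8) */
		int num_cpus = libbpf_num_possible_cpus();
		int zero = 0, err;
		void *values;

		if (num_cpus < 0)
			return num_cpus;

		values = calloc(num_cpus, elem_sz);
		if (!values)
			return -ENOMEM;

		err = bpf_map_lookup_elem(map_fd, &zero, values);
		/* on success, CPU i's copy of the section data lives at
		 * values + i * elem_sz
		 */
		free(values);
		return err;
	}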
> pw-bot: cr
>
>>         if (map->st_ops) {
>>                 zfree(&map->st_ops->data);
>> @@ -10132,14 +10225,18 @@ int bpf_map__fd(const struct bpf_map *map)
>>
>>  static bool map_uses_real_name(const struct bpf_map *map)
>>  {
>> -       /* Since libbpf started to support custom .data.* and .rodata.* maps,
>> -        * their user-visible name differs from kernel-visible name. Users see
>> -        * such map's corresponding ELF section name as a map name.
>> -        * This check distinguishes .data/.rodata from .data.* and .rodata.*
>> -        * maps to know which name has to be returned to the user.
>> +       /* Since libbpf started to support custom .data.*, .percpu.* and
>> +        * .rodata.* maps, their user-visible name differs from kernel-visible
>> +        * name. Users see such map's corresponding ELF section name as a map
>> +        * name. This check distinguishes .data/.percpu/.rodata from .data.*,
>> +        * .percpu.* and .rodata.* maps to know which name has to be returned to
>> +        * the user.
>>          */
>>         if (map->libbpf_type == LIBBPF_MAP_DATA && strcmp(map->real_name, DATA_SEC) != 0)
>>                 return true;
>> +       if (map->libbpf_type == LIBBPF_MAP_PERCPU_DATA &&
>> +           strcmp(map->real_name, PERCPU_DATA_SEC) != 0)
>> +               return true;
>
> nit: shorten LIBBPF_MAP_PERCPU_DATA and keep single line, please
>

Ack.

>>         if (map->libbpf_type == LIBBPF_MAP_RODATA && strcmp(map->real_name, RODATA_SEC) != 0)
>>                 return true;
>>         return false;
>> @@ -10348,7 +10445,8 @@ int bpf_map__set_initial_value(struct bpf_map *map,
>>         if (map->obj->loaded || map->reused)
>>                 return libbpf_err(-EBUSY);
>>
>> -       if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG)
>> +       if ((!map->mmaped && !map->data) ||
>> +           map->libbpf_type == LIBBPF_MAP_KCONFIG)
>>                 return libbpf_err(-EINVAL);
>>
>>         if (map->def.type == BPF_MAP_TYPE_ARENA)
>> @@ -10358,7 +10456,7 @@ int bpf_map__set_initial_value(struct bpf_map *map,
>>         if (size != actual_sz)
>>                 return libbpf_err(-EINVAL);
>>
>> -       memcpy(map->mmaped, data, size);
>> +       map_copy_data(map, data);
>>         return 0;
>>  }
>>
>> @@ -10370,7 +10468,7 @@ void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize)
>>                 return map->st_ops->data;
>>         }
>>
>> -       if (!map->mmaped)
>> +       if (!map->mmaped && !map->data)
>>                 return NULL;
>>
>>         if (map->def.type == BPF_MAP_TYPE_ARENA)
>> @@ -10378,7 +10476,7 @@ void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize)
>>         else
>>                 *psize = map->def.value_size;
>>
>> -       return map->mmaped;
>> +       return map->libbpf_type == LIBBPF_MAP_PERCPU_DATA ? map->data : map->mmaped;
>
> Good chunk of changes like this wouldn't be necessary if we just reuse
> map->mmaped.
>

Yes, you're right. It's unnecessary to change it if we reuse map->mmaped.

>
>>  }
>>
>>  bool bpf_map__is_internal(const struct bpf_map *map)
>> --
>> 2.47.1
>>

Thanks,
Leon
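P.S. For anyone following the series, this is roughly the BPF-side usage
the patch set is aiming at. It's only a sketch: the names are made up,
and direct access to the per-CPU copy depends on the rest of the series,
so the exact semantics may still change in later revisions.

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	/* Every possible CPU gets its own copy of this global, so
	 * programs running concurrently on different CPUs don't
	 * contend on a shared counter.
	 */
	__u64 pkt_cnt SEC(".percpu");

	SEC("xdp")
	int count_packets(struct xdp_md *ctx)
	{
		pkt_cnt++; /* updates only the current CPU's copy */
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";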