On Fri, Jul 29, 2022 at 03:23:16PM +0000, Yafang Shao wrote: > A new member memcg_fd is introduced into bpf attr of BPF_MAP_CREATE > command, which is the fd of an opened cgroup directory. In this cgroup, > the memory subsystem must be enabled. This value is valid only when > BPF_F_SELECTABLE_MEMCG is set in map_flags. Once the kernel get the > memory cgroup from this fd, it will set this memcg into bpf map, then > all the subsequent memory allocation of this map will be charge to the > memcg. > > The map creation paths in libbpf are also changed consequently. > > Currently it is only supported for cgroup2 directory. > > The usage of this new member as follows, > struct bpf_map_create_opts map_opts = { > .sz = sizeof(map_opts), > .map_flags = BPF_F_SELECTABLE_MEMCG, > }; > int memcg_fd, int map_fd; > int key, value; > > memcg_fd = open("/cgroup2", O_DIRECTORY); > if (memcg_fd < 0) { > perror("memcg dir open"); > return -1; > } > > map_opts.memcg_fd = memcg_fd; > map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, "map_for_memcg", > sizeof(key), sizeof(value), > 1024, &map_opts); > if (map_fd <= 0) { > perror("map create"); > return -1; > } Overall the api extension makes sense. The flexibility of selecting memcg is useful. > Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx> > --- > include/uapi/linux/bpf.h | 2 ++ > kernel/bpf/syscall.c | 47 ++++++++++++++++++++++++++-------- > tools/include/uapi/linux/bpf.h | 2 ++ > tools/lib/bpf/bpf.c | 1 + > tools/lib/bpf/bpf.h | 3 ++- > tools/lib/bpf/libbpf.c | 2 ++ > 6 files changed, 46 insertions(+), 11 deletions(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index d5fc1ea70b59..a6e02c8be924 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -1296,6 +1296,8 @@ union bpf_attr { > * struct stored as the > * map value > */ > + __s32 memcg_fd; /* selectable memcg */ > + __s32 :32; /* hole */ new fields cannot be inserted in the middle of uapi struct. > /* Any per-map-type extra fields > * > * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 6401cc417fa9..9900e2b87315 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -402,14 +402,30 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) > } > > #ifdef CONFIG_MEMCG_KMEM > -static void bpf_map_save_memcg(struct bpf_map *map) > +static int bpf_map_save_memcg(struct bpf_map *map, union bpf_attr *attr) > { > - /* Currently if a map is created by a process belonging to the root > - * memory cgroup, get_obj_cgroup_from_current() will return NULL. > - * So we have to check map->objcg for being NULL each time it's > - * being used. > - */ > - map->objcg = get_obj_cgroup_from_current(); > + struct obj_cgroup *objcg; > + struct cgroup *cgrp; > + > + if (attr->map_flags & BPF_F_SELECTABLE_MEMCG) { The flag is unnecessary. Just add memcg_fd to the end of attr and use != 0 as a condition that it should be used instead of get_obj_cgroup_from_current(). There are other parts of bpf uapi that have similar fd handling logic. > + cgrp = cgroup_get_from_fd(attr->memcg_fd); > + if (IS_ERR(cgrp)) > + return -EINVAL; > + > + objcg = get_obj_cgroup_from_cgroup(cgrp); > + if (IS_ERR(objcg)) > + return PTR_ERR(objcg); > + } else { > + /* Currently if a map is created by a process belonging to the root > + * memory cgroup, get_obj_cgroup_from_current() will return NULL. > + * So we have to check map->objcg for being NULL each time it's > + * being used. > + */ > + objcg = get_obj_cgroup_from_current(); > + } > + > + map->objcg = objcg; > + return 0; > } > > static void bpf_map_release_memcg(struct bpf_map *map) > @@ -485,8 +501,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, > } > > #else > -static void bpf_map_save_memcg(struct bpf_map *map) > +static int bpf_map_save_memcg(struct bpf_map *map, union bpf_attr *attr) > { > + return 0; > } > > static void bpf_map_release_memcg(struct bpf_map *map) > @@ -530,13 +547,18 @@ void *bpf_map_container_alloc(union bpf_attr *attr, u64 size, int numa_node) High level uapi struct should not be passed into low level helper like this. Pls pass memcg_fd instead. > { > struct bpf_map *map; > void *container; > + int ret; > > container = __bpf_map_area_alloc(size, numa_node, false); > if (!container) > return ERR_PTR(-ENOMEM); > > map = (struct bpf_map *)container; > - bpf_map_save_memcg(map); > + ret = bpf_map_save_memcg(map, attr); > + if (ret) { > + bpf_map_area_free(container); > + return ERR_PTR(ret); > + } > > return container; > } > @@ -547,6 +569,7 @@ void *bpf_map_container_mmapable_alloc(union bpf_attr *attr, u64 size, > struct bpf_map *map; > void *container; > void *ptr; > + int ret; > > /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ > ptr = __bpf_map_area_alloc(size, numa_node, true); > @@ -555,7 +578,11 @@ void *bpf_map_container_mmapable_alloc(union bpf_attr *attr, u64 size, > > container = ptr + align - offset; > map = (struct bpf_map *)container; > - bpf_map_save_memcg(map); > + ret = bpf_map_save_memcg(map, attr); > + if (ret) { > + bpf_map_area_free(ptr); > + return ERR_PTR(ret); > + } > > return ptr; > } > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index d5fc1ea70b59..a6e02c8be924 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -1296,6 +1296,8 @@ union bpf_attr { > * struct stored as the > * map value > */ > + __s32 memcg_fd; /* selectable memcg */ > + __s32 :32; /* hole */ > /* Any per-map-type extra fields > * > * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the > diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c > index 5eb0df90eb2b..662ce5808386 100644 > --- a/tools/lib/bpf/bpf.c > +++ b/tools/lib/bpf/bpf.c > @@ -199,6 +199,7 @@ int bpf_map_create(enum bpf_map_type map_type, > attr.map_extra = OPTS_GET(opts, map_extra, 0); > attr.numa_node = OPTS_GET(opts, numa_node, 0); > attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0); > + attr.memcg_fd = OPTS_GET(opts, memcg_fd, 0); > > fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); > return libbpf_err_errno(fd); > diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h > index 88a7cc4bd76f..481aad49422b 100644 > --- a/tools/lib/bpf/bpf.h > +++ b/tools/lib/bpf/bpf.h > @@ -51,8 +51,9 @@ struct bpf_map_create_opts { > > __u32 numa_node; > __u32 map_ifindex; > + __u32 memcg_fd; > }; > -#define bpf_map_create_opts__last_field map_ifindex > +#define bpf_map_create_opts__last_field memcg_fd > > LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, > const char *map_name, > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c > index 50d41815f431..86916d550031 100644 > --- a/tools/lib/bpf/libbpf.c > +++ b/tools/lib/bpf/libbpf.c > @@ -505,6 +505,7 @@ struct bpf_map { > bool pinned; > bool reused; > bool autocreate; > + __s32 memcg_fd; > __u64 map_extra; > }; > > @@ -4928,6 +4929,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b > create_attr.map_ifindex = map->map_ifindex; > create_attr.map_flags = def->map_flags; > create_attr.numa_node = map->numa_node; > + create_attr.memcg_fd = map->memcg_fd; > create_attr.map_extra = map->map_extra; > > if (bpf_map__is_struct_ops(map)) > -- > 2.17.1 >