Andrii Nakryiko <andriin@xxxxxx> [Sun, 2020-03-29 20:00 -0700]: > Implement new sub-command to attach cgroup BPF programs and return FD-based > bpf_link back on success. bpf_link, once attached to cgroup, cannot be > replaced, except by owner having its FD. Cgroup bpf_link supports only > BPF_F_ALLOW_MULTI semantics. Both link-based and prog-based BPF_F_ALLOW_MULTI > attachments can be freely intermixed. > > To prevent bpf_cgroup_link from keeping cgroup alive past the point when no > BPF program can be executed, implement auto-detachment of link. When > cgroup_bpf_release() is called, all attached bpf_links are forced to release > cgroup refcounts, but they leave bpf_link otherwise active and allocated, as > well as still owning underlying bpf_prog. This is because user-space might > still have FDs open and active, so bpf_link as a user-referenced object can't > be freed yet. Once last active FD is closed, bpf_link will be freed and > underlying bpf_prog refcount will be dropped. But cgroup refcount won't be > touched, because cgroup is released already. > > The inherent race between bpf_cgroup_link release (from closing last FD) and > cgroup_bpf_release() is resolved by both operations taking cgroup_mutex. So > the only additional check required is when bpf_cgroup_link attempts to detach > itself from cgroup. At that time we need to check whether there is still > cgroup associated with that link. And if not, exit with success, because > bpf_cgroup_link was already successfully detached. 
> > Acked-by: Roman Gushchin <guro@xxxxxx> > Signed-off-by: Andrii Nakryiko <andriin@xxxxxx> > --- > include/linux/bpf-cgroup.h | 29 ++- > include/linux/bpf.h | 10 +- > include/uapi/linux/bpf.h | 10 +- > kernel/bpf/cgroup.c | 315 +++++++++++++++++++++++++-------- > kernel/bpf/syscall.c | 61 ++++++- > kernel/cgroup/cgroup.c | 14 +- > tools/include/uapi/linux/bpf.h | 10 +- > 7 files changed, 351 insertions(+), 98 deletions(-) > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h > index a7cd5c7a2509..d2d969669564 100644 > --- a/include/linux/bpf-cgroup.h > +++ b/include/linux/bpf-cgroup.h > @@ -51,9 +51,18 @@ struct bpf_cgroup_storage { > struct rcu_head rcu; > }; > > +struct bpf_cgroup_link { > + struct bpf_link link; > + struct cgroup *cgroup; > + enum bpf_attach_type type; > +}; > + > +extern const struct bpf_link_ops bpf_cgroup_link_lops; > + > struct bpf_prog_list { > struct list_head node; > struct bpf_prog *prog; > + struct bpf_cgroup_link *link; > struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; > }; > > @@ -84,20 +93,23 @@ struct cgroup_bpf { > int cgroup_bpf_inherit(struct cgroup *cgrp); > void cgroup_bpf_offline(struct cgroup *cgrp); > > -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > - struct bpf_prog *replace_prog, > +int __cgroup_bpf_attach(struct cgroup *cgrp, > + struct bpf_prog *prog, struct bpf_prog *replace_prog, > + struct bpf_cgroup_link *link, > enum bpf_attach_type type, u32 flags); > int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, > + struct bpf_cgroup_link *link, > enum bpf_attach_type type); > int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, > union bpf_attr __user *uattr); > > /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ > -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > - struct bpf_prog *replace_prog, enum bpf_attach_type type, > +int cgroup_bpf_attach(struct cgroup *cgrp, > + struct bpf_prog *prog, 
struct bpf_prog *replace_prog, > + struct bpf_cgroup_link *link, enum bpf_attach_type type, > u32 flags); > int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, > - enum bpf_attach_type type, u32 flags); > + enum bpf_attach_type type); > int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, > union bpf_attr __user *uattr); > > @@ -332,6 +344,7 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, > enum bpf_prog_type ptype, struct bpf_prog *prog); > int cgroup_bpf_prog_detach(const union bpf_attr *attr, > enum bpf_prog_type ptype); > +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); > int cgroup_bpf_prog_query(const union bpf_attr *attr, > union bpf_attr __user *uattr); > #else > @@ -354,6 +367,12 @@ static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, > return -EINVAL; > } > > +static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, > + struct bpf_prog *prog) > +{ > + return -EINVAL; > +} > + > static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, > union bpf_attr __user *uattr) > { > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 3bde59a8453b..56254d880293 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1082,15 +1082,23 @@ extern int sysctl_unprivileged_bpf_disabled; > int bpf_map_new_fd(struct bpf_map *map, int flags); > int bpf_prog_new_fd(struct bpf_prog *prog); > > -struct bpf_link; > +struct bpf_link { > + atomic64_t refcnt; > + const struct bpf_link_ops *ops; > + struct bpf_prog *prog; > + struct work_struct work; > +}; > > struct bpf_link_ops { > void (*release)(struct bpf_link *link); > void (*dealloc)(struct bpf_link *link); > + > }; > > void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, > struct bpf_prog *prog); > +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, > + int link_fd); > void bpf_link_inc(struct bpf_link *link); > void bpf_link_put(struct bpf_link *link); > 
int bpf_link_new_fd(struct bpf_link *link); > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index f1fbc36f58d3..8b3f1c098ac0 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -111,6 +111,7 @@ enum bpf_cmd { > BPF_MAP_LOOKUP_AND_DELETE_BATCH, > BPF_MAP_UPDATE_BATCH, > BPF_MAP_DELETE_BATCH, > + BPF_LINK_CREATE, > }; > > enum bpf_map_type { > @@ -541,7 +542,7 @@ union bpf_attr { > __u32 prog_cnt; > } query; > > - struct { > + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ > __u64 name; > __u32 prog_fd; > } raw_tracepoint; > @@ -569,6 +570,13 @@ union bpf_attr { > __u64 probe_offset; /* output: probe_offset */ > __u64 probe_addr; /* output: probe_addr */ > } task_fd_query; > + > + struct { /* struct used by BPF_LINK_CREATE command */ > + __u32 prog_fd; /* eBPF program to attach */ > + __u32 target_fd; /* object to attach to */ > + __u32 attach_type; /* attach type */ > + __u32 flags; /* extra flags */ > + } link_create; > } __attribute__((aligned(8))); > > /* The description below is an attempt at providing documentation to eBPF > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c > index 9c8472823a7f..c24029937431 100644 > --- a/kernel/bpf/cgroup.c > +++ b/kernel/bpf/cgroup.c > @@ -80,6 +80,17 @@ static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) > bpf_cgroup_storage_unlink(storages[stype]); > } > > +/* Called when bpf_cgroup_link is auto-detached from dying cgroup. > + * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It > + * doesn't free link memory, which will eventually be done by bpf_link's > + * release() callback, when its last FD is closed. 
> + */ > +static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link) > +{ > + cgroup_put(link->cgroup); > + link->cgroup = NULL; > +} > + > /** > * cgroup_bpf_release() - put references of all bpf programs and > * release all cgroup bpf data > @@ -100,7 +111,10 @@ static void cgroup_bpf_release(struct work_struct *work) > > list_for_each_entry_safe(pl, tmp, progs, node) { > list_del(&pl->node); > - bpf_prog_put(pl->prog); > + if (pl->prog) > + bpf_prog_put(pl->prog); > + if (pl->link) > + bpf_cgroup_link_auto_detach(pl->link); > bpf_cgroup_storages_unlink(pl->storage); > bpf_cgroup_storages_free(pl->storage); > kfree(pl); > @@ -134,6 +148,18 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) > queue_work(system_wq, &cgrp->bpf.release_work); > } > > +/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through > + * link or direct prog. > + */ > +static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) > +{ > + if (pl->prog) > + return pl->prog; > + if (pl->link) > + return pl->link->link.prog; > + return NULL; > +} > + > /* count number of elements in the list. 
> * it's slow but the list cannot be long > */ > @@ -143,7 +169,7 @@ static u32 prog_list_length(struct list_head *head) > u32 cnt = 0; > > list_for_each_entry(pl, head, node) { > - if (!pl->prog) > + if (!prog_list_prog(pl)) > continue; > cnt++; > } > @@ -212,11 +238,11 @@ static int compute_effective_progs(struct cgroup *cgrp, > continue; > > list_for_each_entry(pl, &p->bpf.progs[type], node) { > - if (!pl->prog) > + if (!prog_list_prog(pl)) > continue; > > item = &progs->items[cnt]; > - item->prog = pl->prog; > + item->prog = prog_list_prog(pl); > bpf_cgroup_storages_assign(item->cgroup_storage, > pl->storage); > cnt++; > @@ -333,19 +359,60 @@ static int update_effective_progs(struct cgroup *cgrp, > > #define BPF_CGROUP_MAX_PROGS 64 > > +static struct bpf_prog_list *find_attach_entry(struct list_head *progs, > + struct bpf_prog *prog, > + struct bpf_cgroup_link *link, > + struct bpf_prog *replace_prog, > + bool allow_multi) > +{ > + struct bpf_prog_list *pl; > + > + /* single-attach case */ > + if (!allow_multi) { > + if (list_empty(progs)) > + return NULL; > + return list_first_entry(progs, typeof(*pl), node); > + } > + > + list_for_each_entry(pl, progs, node) { > + if (prog && pl->prog == prog) > + /* disallow attaching the same prog twice */ > + return ERR_PTR(-EINVAL); > + if (link && pl->link == link) > + /* disallow attaching the same link twice */ > + return ERR_PTR(-EINVAL); > + } > + > + /* direct prog multi-attach w/ replacement case */ > + if (replace_prog) { > + list_for_each_entry(pl, progs, node) { > + if (pl->prog == replace_prog) > + /* a match found */ > + return pl; > + } > + /* prog to replace not found for cgroup */ > + return ERR_PTR(-ENOENT); > + } > + > + return NULL; > +} > + > /** > - * __cgroup_bpf_attach() - Attach the program to a cgroup, and > + * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and > * propagate the change to descendants > * @cgrp: The cgroup which descendants to traverse > * @prog: A program to 
attach > + * @link: A link to attach > * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set > * @type: Type of attach operation > * @flags: Option flags > * > + * Exactly one of @prog or @link can be non-null. > * Must be called with cgroup_mutex held. > */ > -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > - struct bpf_prog *replace_prog, > +int __cgroup_bpf_attach(struct cgroup *cgrp, > + struct bpf_prog *prog, struct bpf_prog *replace_prog, > + struct bpf_cgroup_link *link, > enum bpf_attach_type type, u32 flags) > { > u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); > @@ -353,13 +420,19 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > struct bpf_prog *old_prog = NULL; > struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], > *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; > - struct bpf_prog_list *pl, *replace_pl = NULL; > + struct bpf_prog_list *pl; > int err; > > if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || > ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) > /* invalid combination */ > return -EINVAL; > + if (link && (prog || replace_prog)) > + /* only either link or prog/replace_prog can be specified */ > + return -EINVAL; > + if (!!replace_prog != !!(flags & BPF_F_REPLACE)) > + /* replace_prog implies BPF_F_REPLACE, and vice versa */ > + return -EINVAL; > > if (!hierarchy_allows_attach(cgrp, type)) > return -EPERM; > @@ -374,26 +447,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) > return -E2BIG; > > - if (flags & BPF_F_ALLOW_MULTI) { > - list_for_each_entry(pl, progs, node) { > - if (pl->prog == prog) > - /* disallow attaching the same prog twice */ > - return -EINVAL; > - if (pl->prog == replace_prog) > - replace_pl = pl; > - } > - if ((flags & BPF_F_REPLACE) && !replace_pl) > - /* prog to replace not found for cgroup */ > - 
return -ENOENT; > - } else if (!list_empty(progs)) { > - replace_pl = list_first_entry(progs, typeof(*pl), node); > - } > + pl = find_attach_entry(progs, prog, link, replace_prog, > + flags & BPF_F_ALLOW_MULTI); > + if (IS_ERR(pl)) > + return PTR_ERR(pl); > > - if (bpf_cgroup_storages_alloc(storage, prog)) > + if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)) > return -ENOMEM; > > - if (replace_pl) { > - pl = replace_pl; > + if (pl) { > old_prog = pl->prog; > bpf_cgroup_storages_unlink(pl->storage); > bpf_cgroup_storages_assign(old_storage, pl->storage); > @@ -407,6 +469,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > } > > pl->prog = prog; > + pl->link = link; > bpf_cgroup_storages_assign(pl->storage, storage); > cgrp->bpf.flags[type] = saved_flags; > > @@ -414,80 +477,93 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > if (err) > goto cleanup; > > - static_branch_inc(&cgroup_bpf_enabled_key); > bpf_cgroup_storages_free(old_storage); > - if (old_prog) { > + if (old_prog) > bpf_prog_put(old_prog); > - static_branch_dec(&cgroup_bpf_enabled_key); > - } > - bpf_cgroup_storages_link(storage, cgrp, type); > + else > + static_branch_inc(&cgroup_bpf_enabled_key); > + bpf_cgroup_storages_link(pl->storage, cgrp, type); > return 0; > > cleanup: > - /* and cleanup the prog list */ > - pl->prog = old_prog; > + if (old_prog) { > + pl->prog = old_prog; > + pl->link = NULL; > + } > bpf_cgroup_storages_free(pl->storage); > bpf_cgroup_storages_assign(pl->storage, old_storage); > bpf_cgroup_storages_link(pl->storage, cgrp, type); > - if (!replace_pl) { > + if (!old_prog) { > list_del(&pl->node); > kfree(pl); > } > return err; > } > > +static struct bpf_prog_list *find_detach_entry(struct list_head *progs, > + struct bpf_prog *prog, > + struct bpf_cgroup_link *link, > + bool allow_multi) > +{ > + struct bpf_prog_list *pl; > + > + if (!allow_multi) { > + if (list_empty(progs)) > + /* report error when trying to detach 
and nothing is attached */ > + return ERR_PTR(-ENOENT); > + > + /* to maintain backward compatibility NONE and OVERRIDE cgroups > + * allow detaching with invalid FD (prog==NULL) in legacy mode > + */ > + return list_first_entry(progs, typeof(*pl), node); > + } > + > + if (!prog && !link) > + /* to detach MULTI prog the user has to specify valid FD > + * of the program or link to be detached > + */ > + return ERR_PTR(-EINVAL); > + > + /* find the prog or link and detach it */ > + list_for_each_entry(pl, progs, node) { > + if (pl->prog == prog && pl->link == link) > + return pl; > + } > + return ERR_PTR(-ENOENT); > +} > + > /** > - * __cgroup_bpf_detach() - Detach the program from a cgroup, and > + * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and > * propagate the change to descendants > * @cgrp: The cgroup which descendants to traverse > * @prog: A program to detach or NULL > + * @prog: A link to detach or NULL > * @type: Type of detach operation > * > + * At most one of @prog or @link can be non-NULL. > * Must be called with cgroup_mutex held. 
> */ > int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, > - enum bpf_attach_type type) > + struct bpf_cgroup_link *link, enum bpf_attach_type type) > { > struct list_head *progs = &cgrp->bpf.progs[type]; > u32 flags = cgrp->bpf.flags[type]; > - struct bpf_prog *old_prog = NULL; > struct bpf_prog_list *pl; > + struct bpf_prog *old_prog; > int err; > > - if (flags & BPF_F_ALLOW_MULTI) { > - if (!prog) > - /* to detach MULTI prog the user has to specify valid FD > - * of the program to be detached > - */ > - return -EINVAL; > - } else { > - if (list_empty(progs)) > - /* report error when trying to detach and nothing is attached */ > - return -ENOENT; > - } > + if (prog && link) > + /* only one of prog or link can be specified */ > + return -EINVAL; > > - if (flags & BPF_F_ALLOW_MULTI) { > - /* find the prog and detach it */ > - list_for_each_entry(pl, progs, node) { > - if (pl->prog != prog) > - continue; > - old_prog = prog; > - /* mark it deleted, so it's ignored while > - * recomputing effective > - */ > - pl->prog = NULL; > - break; > - } > - if (!old_prog) > - return -ENOENT; > - } else { > - /* to maintain backward compatibility NONE and OVERRIDE cgroups > - * allow detaching with invalid FD (prog==NULL) > - */ > - pl = list_first_entry(progs, typeof(*pl), node); > - old_prog = pl->prog; > - pl->prog = NULL; > - } > + pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI); > + if (IS_ERR(pl)) > + return PTR_ERR(pl); > + > + /* mark it deleted, so it's ignored while recomputing effective */ > + old_prog = pl->prog; > + pl->prog = NULL; > + pl->link = NULL; > > err = update_effective_progs(cgrp, type); > if (err) > @@ -501,14 +577,15 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, > if (list_empty(progs)) > /* last program was detached, reset flags to zero */ > cgrp->bpf.flags[type] = 0; > - > - bpf_prog_put(old_prog); > + if (old_prog) > + bpf_prog_put(old_prog); > static_branch_dec(&cgroup_bpf_enabled_key); 
> return 0; > > cleanup: > - /* and restore back old_prog */ > + /* restore back prog or link */ > pl->prog = old_prog; > + pl->link = link; > return err; > } > > @@ -521,6 +598,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, > struct list_head *progs = &cgrp->bpf.progs[type]; > u32 flags = cgrp->bpf.flags[type]; > struct bpf_prog_array *effective; > + struct bpf_prog *prog; > int cnt, ret = 0, i; > > effective = rcu_dereference_protected(cgrp->bpf.effective[type], > @@ -551,7 +629,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, > > i = 0; > list_for_each_entry(pl, progs, node) { > - id = pl->prog->aux->id; > + prog = prog_list_prog(pl); > + id = prog->aux->id; > if (copy_to_user(prog_ids + i, &id, sizeof(id))) > return -EFAULT; > if (++i == cnt) > @@ -581,8 +660,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, > } > } > > - ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, > - attr->attach_flags); > + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, > + attr->attach_type, attr->attach_flags); > > if (replace_prog) > bpf_prog_put(replace_prog); > @@ -604,7 +683,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) > if (IS_ERR(prog)) > prog = NULL; > > - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); > + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); > if (prog) > bpf_prog_put(prog); > > @@ -612,6 +691,90 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) > return ret; > } > > +static void bpf_cgroup_link_release(struct bpf_link *link) > +{ > + struct bpf_cgroup_link *cg_link = > + container_of(link, struct bpf_cgroup_link, link); > + > + /* link might have been auto-detached by dying cgroup already, > + * in that case our work is done here > + */ > + if (!cg_link->cgroup) > + return; > + > + mutex_lock(&cgroup_mutex); > + > + /* re-check cgroup under lock again */ > + if 
(!cg_link->cgroup) { > + mutex_unlock(&cgroup_mutex); > + return; > + } > + > + WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, > + cg_link->type)); > + > + mutex_unlock(&cgroup_mutex); > + cgroup_put(cg_link->cgroup); > +} > + > +static void bpf_cgroup_link_dealloc(struct bpf_link *link) > +{ > + struct bpf_cgroup_link *cg_link = > + container_of(link, struct bpf_cgroup_link, link); > + > + kfree(cg_link); > +} > + > +const struct bpf_link_ops bpf_cgroup_link_lops = { > + .release = bpf_cgroup_link_release, > + .dealloc = bpf_cgroup_link_dealloc, > +}; > + > +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) > +{ > + struct bpf_cgroup_link *link; > + struct file *link_file; > + struct cgroup *cgrp; > + int err, link_fd; > + > + if (attr->link_create.flags) > + return -EINVAL; > + > + cgrp = cgroup_get_from_fd(attr->link_create.target_fd); > + if (IS_ERR(cgrp)) > + return PTR_ERR(cgrp); > + > + link = kzalloc(sizeof(*link), GFP_USER); > + if (!link) { > + err = -ENOMEM; > + goto out_put_cgroup; > + } > + bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); > + link->cgroup = cgrp; > + link->type = attr->link_create.attach_type; > + > + link_file = bpf_link_new_file(&link->link, &link_fd); > + if (IS_ERR(link_file)) { > + kfree(link); > + err = PTR_ERR(link_file); > + goto out_put_cgroup; > + } > + > + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, > + BPF_F_ALLOW_MULTI); > + if (err) { > + bpf_link_cleanup(&link->link, link_file, link_fd); > + goto out_put_cgroup; > + } > + > + fd_install(link_fd, link_file); > + return link_fd; > + > +out_put_cgroup: > + cgroup_put(cgrp); > + return err; > +} > + > int cgroup_bpf_prog_query(const union bpf_attr *attr, > union bpf_attr __user *uattr) > { > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index a616b63f23b4..05412b83ed6c 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -2175,13 +2175,6 @@ static int bpf_obj_get(const union 
bpf_attr *attr) > attr->file_flags); > } > > -struct bpf_link { > - atomic64_t refcnt; > - const struct bpf_link_ops *ops; > - struct bpf_prog *prog; > - struct work_struct work; > -}; > - > void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, > struct bpf_prog *prog) > { > @@ -2195,8 +2188,8 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, > * anon_inode's release() call. This helper manages marking bpf_link as > * defunct, releases anon_inode file and puts reserved FD. > */ > -static void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, > - int link_fd) > +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, > + int link_fd) > { > link->prog = NULL; > fput(link_file); > @@ -2266,6 +2259,10 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) > link_type = "raw_tracepoint"; > else if (link->ops == &bpf_tracing_link_lops) > link_type = "tracing"; > +#ifdef CONFIG_CGROUP_BPF > + else if (link->ops == &bpf_cgroup_link_lops) > + link_type = "cgroup"; > +#endif > else > link_type = "unknown"; > > @@ -3553,6 +3550,49 @@ static int bpf_map_do_batch(const union bpf_attr *attr, > return err; > } > > +#define BPF_LINK_CREATE_LAST_FIELD link_create.flags > +static int link_create(union bpf_attr *attr) > +{ From what I see, this function does not check any capability, whereas the existing bpf_prog_attach() checks for CAP_NET_ADMIN. This is a pretty important difference, but I don't see it clarified in the commit message or discussed (or I missed it?). Having a way to attach cgroup bpf prog by non-priv users is actually helpful in some use-cases, e.g. systemd required patching in the past to make it work with user (non-priv) sessions, see [0]. But in other cases it's also useful to limit the ability to attach programs to a cgroup while using bpf_link so that only the thing that controls cgroup setup can attach but not any non-priv process running in that cgroup.
How is this use-case covered in BPF_LINK_CREATE? [0] https://github.com/systemd/systemd/pull/12745 > + enum bpf_prog_type ptype; > + struct bpf_prog *prog; > + int ret; > + > + if (CHECK_ATTR(BPF_LINK_CREATE)) > + return -EINVAL; > + > + ptype = attach_type_to_prog_type(attr->link_create.attach_type); > + if (ptype == BPF_PROG_TYPE_UNSPEC) > + return -EINVAL; > + > + prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); > + if (IS_ERR(prog)) > + return PTR_ERR(prog); > + > + ret = bpf_prog_attach_check_attach_type(prog, > + attr->link_create.attach_type); > + if (ret) > + goto err_out; > + > + switch (ptype) { > + case BPF_PROG_TYPE_CGROUP_SKB: > + case BPF_PROG_TYPE_CGROUP_SOCK: > + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: > + case BPF_PROG_TYPE_SOCK_OPS: > + case BPF_PROG_TYPE_CGROUP_DEVICE: > + case BPF_PROG_TYPE_CGROUP_SYSCTL: > + case BPF_PROG_TYPE_CGROUP_SOCKOPT: > + ret = cgroup_bpf_link_attach(attr, prog); > + break; > + default: > + ret = -EINVAL; > + } > + > +err_out: > + if (ret < 0) > + bpf_prog_put(prog); > + return ret; > +} > + > SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) > { > union bpf_attr attr = {}; > @@ -3663,6 +3703,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz > case BPF_MAP_DELETE_BATCH: > err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); > break; > + case BPF_LINK_CREATE: > + err = link_create(&attr); > + break; > default: > err = -EINVAL; > break; > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > index 3dead0416b91..219624fba9ba 100644 > --- a/kernel/cgroup/cgroup.c > +++ b/kernel/cgroup/cgroup.c > @@ -6303,27 +6303,31 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) > #endif /* CONFIG_SOCK_CGROUP_DATA */ > > #ifdef CONFIG_CGROUP_BPF > -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > - struct bpf_prog *replace_prog, enum bpf_attach_type type, > +int cgroup_bpf_attach(struct cgroup *cgrp, > + struct bpf_prog 
*prog, struct bpf_prog *replace_prog, > + struct bpf_cgroup_link *link, > + enum bpf_attach_type type, > u32 flags) > { > int ret; > > mutex_lock(&cgroup_mutex); > - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); > + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); > mutex_unlock(&cgroup_mutex); > return ret; > } > + > int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, > - enum bpf_attach_type type, u32 flags) > + enum bpf_attach_type type) > { > int ret; > > mutex_lock(&cgroup_mutex); > - ret = __cgroup_bpf_detach(cgrp, prog, type); > + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); > mutex_unlock(&cgroup_mutex); > return ret; > } > + > int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, > union bpf_attr __user *uattr) > { > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index f1fbc36f58d3..8b3f1c098ac0 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -111,6 +111,7 @@ enum bpf_cmd { > BPF_MAP_LOOKUP_AND_DELETE_BATCH, > BPF_MAP_UPDATE_BATCH, > BPF_MAP_DELETE_BATCH, > + BPF_LINK_CREATE, > }; > > enum bpf_map_type { > @@ -541,7 +542,7 @@ union bpf_attr { > __u32 prog_cnt; > } query; > > - struct { > + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ > __u64 name; > __u32 prog_fd; > } raw_tracepoint; > @@ -569,6 +570,13 @@ union bpf_attr { > __u64 probe_offset; /* output: probe_offset */ > __u64 probe_addr; /* output: probe_addr */ > } task_fd_query; > + > + struct { /* struct used by BPF_LINK_CREATE command */ > + __u32 prog_fd; /* eBPF program to attach */ > + __u32 target_fd; /* object to attach to */ > + __u32 attach_type; /* attach type */ > + __u32 flags; /* extra flags */ > + } link_create; > } __attribute__((aligned(8))); > > /* The description below is an attempt at providing documentation to eBPF > -- > 2.17.1 > -- Andrey Ignatov