On Thu, May 11, 2023 at 8:57 AM Stanislav Fomichev <sdf@xxxxxxxxxx> wrote:
>
> We're observing some stalls on the heavily loaded machines
> in the cgroup_bpf_prog_query path. This is likely due to
> being blocked on cgroup_mutex.
>
> IIUC, the cgroup_mutex is there mostly to protect the non-effective
> fields (cgrp->bpf.progs) which might be changed by the update path.
> For the BPF_F_QUERY_EFFECTIVE case, all we need is to rcu_dereference
> a bunch of pointers (and keep them around for consistency), so
> let's do it.
>
> Sending out as an RFC because it looks a bit ugly. It would also
> be nice to handle the non-effective case locklessly as well, but it
> might require a larger rework.

Oops, please ignore this one. Somehow got into my unrelated sockopt
series..

> Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> ---
>  include/linux/bpf-cgroup-defs.h |  2 +-
>  include/linux/bpf-cgroup.h      |  1 +
>  kernel/bpf/cgroup.c             | 80 +++++++++++++++++----------------
>  3 files changed, 44 insertions(+), 39 deletions(-)
>
> diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
> index 7b121bd780eb..df0b8faa1a17 100644
> --- a/include/linux/bpf-cgroup-defs.h
> +++ b/include/linux/bpf-cgroup-defs.h
> @@ -56,7 +56,7 @@ struct cgroup_bpf {
>          * have either zero or one element
>          * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
>          */
> -       struct hlist_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
> +       struct hlist_head __rcu progs[MAX_CGROUP_BPF_ATTACH_TYPE];
>         u8 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
>
>         /* list of cgroup shared storages */
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 57e9e109257e..555e9cbb3a05 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -106,6 +106,7 @@ struct bpf_prog_list {
>         struct bpf_prog *prog;
>         struct bpf_cgroup_link *link;
>         struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
> +       struct rcu_head rcu;
>  };
>
>  int cgroup_bpf_inherit(struct cgroup *cgrp);
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index a06e118a9be5..92a1b33dcc06 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -285,12 +285,15 @@ static void cgroup_bpf_release(struct work_struct *work)
>         mutex_lock(&cgroup_mutex);
>
>         for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
> -               struct hlist_head *progs = &cgrp->bpf.progs[atype];
> +               struct hlist_head *progs;
>                 struct bpf_prog_list *pl;
>                 struct hlist_node *pltmp;
>
> +               progs = rcu_dereference_protected(&cgrp->bpf.progs[atype],
> +                                                 lockdep_is_held(&cgroup_mutex));
> +
>                 hlist_for_each_entry_safe(pl, pltmp, progs, node) {
> -                       hlist_del(&pl->node);
> +                       hlist_del_rcu(&pl->node);
>                         if (pl->prog) {
>                                 if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
>                                         bpf_trampoline_unlink_cgroup_shim(pl->prog);
> @@ -301,7 +304,7 @@ static void cgroup_bpf_release(struct work_struct *work)
>                                         bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
>                                 bpf_cgroup_link_auto_detach(pl->link);
>                         }
> -                       kfree(pl);
> +                       kfree_rcu(pl, rcu);
>                         static_branch_dec(&cgroup_bpf_enabled_key[atype]);
>                 }
>                 old_array = rcu_dereference_protected(
> @@ -352,12 +355,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
>  /* count number of elements in the list.
>   * it's slow but the list cannot be long
>   */
> -static u32 prog_list_length(struct hlist_head *head)
> +static u32 prog_list_length(struct hlist_head __rcu *head)
>  {
>         struct bpf_prog_list *pl;
>         u32 cnt = 0;
>
> -       hlist_for_each_entry(pl, head, node) {
> +       hlist_for_each_entry_rcu(pl, head, node) {
>                 if (!prog_list_prog(pl))
>                         continue;
>                 cnt++;
> @@ -553,7 +556,7 @@ static int update_effective_progs(struct cgroup *cgrp,
>
>  #define BPF_CGROUP_MAX_PROGS 64
>
> -static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
> +static struct bpf_prog_list *find_attach_entry(struct hlist_head __rcu *progs,
>                                                struct bpf_prog *prog,
>                                                struct bpf_cgroup_link *link,
>                                                struct bpf_prog *replace_prog,
> @@ -565,10 +568,10 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
>         if (!allow_multi) {
>                 if (hlist_empty(progs))
>                         return NULL;
> -               return hlist_entry(progs->first, typeof(*pl), node);
> +               return hlist_entry(rcu_dereference_raw(progs)->first, typeof(*pl), node);
>         }
>
> -       hlist_for_each_entry(pl, progs, node) {
> +       hlist_for_each_entry_rcu(pl, progs, node) {
>                 if (prog && pl->prog == prog && prog != replace_prog)
>                         /* disallow attaching the same prog twice */
>                         return ERR_PTR(-EINVAL);
> @@ -579,7 +582,7 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
>
>         /* direct prog multi-attach w/ replacement case */
>         if (replace_prog) {
> -               hlist_for_each_entry(pl, progs, node) {
> +               hlist_for_each_entry_rcu(pl, progs, node) {
>                         if (pl->prog == replace_prog)
>                                 /* a match found */
>                                 return pl;
> @@ -615,8 +618,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>         struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
>         struct bpf_prog *new_prog = prog ? : link->link.prog;
>         enum cgroup_bpf_attach_type atype;
> +       struct hlist_head __rcu *progs;
>         struct bpf_prog_list *pl;
> -       struct hlist_head *progs;
>         int err;
>
>         if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
> @@ -658,6 +661,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>                                       prog ? : link->link.prog, cgrp))
>                 return -ENOMEM;
>
> +       WRITE_ONCE(cgrp->bpf.flags[atype], saved_flags);
>         if (pl) {
>                 old_prog = pl->prog;
>         } else {
> @@ -669,12 +673,12 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>                         return -ENOMEM;
>                 }
>                 if (hlist_empty(progs))
> -                       hlist_add_head(&pl->node, progs);
> +                       hlist_add_head_rcu(&pl->node, progs);
>                 else
> -                       hlist_for_each(last, progs) {
> +                       __hlist_for_each_rcu(last, progs) {
>                                 if (last->next)
>                                         continue;
> -                               hlist_add_behind(&pl->node, last);
> +                               hlist_add_behind_rcu(&pl->node, last);
>                                 break;
>                         }
>         }
> @@ -682,7 +686,6 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>         pl->prog = prog;
>         pl->link = link;
>         bpf_cgroup_storages_assign(pl->storage, storage);
> -       cgrp->bpf.flags[atype] = saved_flags;
>
>         if (type == BPF_LSM_CGROUP) {
>                 err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
> @@ -796,7 +799,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
>         enum cgroup_bpf_attach_type atype;
>         struct bpf_prog *old_prog;
>         struct bpf_prog_list *pl;
> -       struct hlist_head *progs;
> +       struct hlist_head __rcu *progs;
>         bool found = false;
>
>         atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
> @@ -808,7 +811,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
>         if (link->link.prog->type != new_prog->type)
>                 return -EINVAL;
>
> -       hlist_for_each_entry(pl, progs, node) {
> +       hlist_for_each_entry_rcu(pl, progs, node) {
>                 if (pl->link == link) {
>                         found = true;
>                         break;
> @@ -847,7 +850,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
>         return ret;
>  }
>
> -static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
> +static struct bpf_prog_list *find_detach_entry(struct hlist_head __rcu *progs,
>                                                struct bpf_prog *prog,
>                                                struct bpf_cgroup_link *link,
>                                                bool allow_multi)
> @@ -862,7 +865,7 @@ static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
>                 /* to maintain backward compatibility NONE and OVERRIDE cgroups
>                  * allow detaching with invalid FD (prog==NULL) in legacy mode
>                  */
> -               return hlist_entry(progs->first, typeof(*pl), node);
> +               return hlist_entry(rcu_dereference_raw(progs)->first, typeof(*pl), node);
>         }
>
>         if (!prog && !link)
> @@ -872,7 +875,7 @@ static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
>                 return ERR_PTR(-EINVAL);
>
>         /* find the prog or link and detach it */
> -       hlist_for_each_entry(pl, progs, node) {
> +       hlist_for_each_entry_rcu(pl, progs, node) {
>                 if (pl->prog == prog && pl->link == link)
>                         return pl;
>         }
> @@ -894,9 +897,9 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
>                                enum cgroup_bpf_attach_type atype)
>  {
>         struct cgroup_subsys_state *css;
> +       struct hlist_head __rcu *head;
>         struct bpf_prog_array *progs;
>         struct bpf_prog_list *pl;
> -       struct hlist_head *head;
>         struct cgroup *cg;
>         int pos;
>
> @@ -913,7 +916,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
>                         continue;
>
>                 head = &cg->bpf.progs[atype];
> -               hlist_for_each_entry(pl, head, node) {
> +               hlist_for_each_entry_rcu(pl, head, node) {
>                         if (!prog_list_prog(pl))
>                                 continue;
>                         if (pl->prog == prog && pl->link == link)
> @@ -950,9 +953,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
>                                struct bpf_cgroup_link *link, enum bpf_attach_type type)
>  {
>         enum cgroup_bpf_attach_type atype;
> +       struct hlist_head __rcu *progs;
>         struct bpf_prog *old_prog;
>         struct bpf_prog_list *pl;
> -       struct hlist_head *progs;
>         u32 attach_btf_id = 0;
>         u32 flags;
>
> @@ -989,12 +992,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
>         }
>
>         /* now can actually delete it from this cgroup list */
> -       hlist_del(&pl->node);
> +       hlist_del_rcu(&pl->node);
>
>         kfree(pl);
>         if (hlist_empty(progs))
>                 /* last program was detached, reset flags to zero */
> -               cgrp->bpf.flags[atype] = 0;
> +               WRITE_ONCE(cgrp->bpf.flags[atype], 0);
>         if (old_prog) {
>                 if (type == BPF_LSM_CGROUP)
>                         bpf_trampoline_unlink_cgroup_shim(old_prog);
> @@ -1022,10 +1025,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>         __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
>         bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
>         __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
> +       struct bpf_prog_array *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
>         enum bpf_attach_type type = attr->query.attach_type;
>         enum cgroup_bpf_attach_type from_atype, to_atype;
>         enum cgroup_bpf_attach_type atype;
> -       struct bpf_prog_array *effective;
>         int cnt, ret = 0, i;
>         int total_cnt = 0;
>         u32 flags;
> @@ -1046,14 +1049,15 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                 if (from_atype < 0)
>                         return -EINVAL;
>                 to_atype = from_atype;
> -               flags = cgrp->bpf.flags[from_atype];
> +               flags = READ_ONCE(cgrp->bpf.flags[from_atype]);
>         }
>
>         for (atype = from_atype; atype <= to_atype; atype++) {
>                 if (effective_query) {
> -                       effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
> -                                                             lockdep_is_held(&cgroup_mutex));
> -                       total_cnt += bpf_prog_array_length(effective);
> +                       effective[atype] = rcu_dereference_protected(
> +                                               cgrp->bpf.effective[atype],
> +                                               lockdep_is_held(&cgroup_mutex));
> +                       total_cnt += bpf_prog_array_length(effective[atype]);
>                 } else {
>                         total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
>                 }
> @@ -1076,12 +1080,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>
>         for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
>                 if (effective_query) {
> -                       effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
> -                                                             lockdep_is_held(&cgroup_mutex));
> -                       cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
> -                       ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
> +                       cnt = min_t(int, bpf_prog_array_length(effective[atype]), total_cnt);
> +                       ret = bpf_prog_array_copy_to_user(effective[atype], prog_ids, cnt);
>                 } else {
> -                       struct hlist_head *progs;
> +                       struct hlist_head __rcu *progs;
>                         struct bpf_prog_list *pl;
>                         struct bpf_prog *prog;
>                         u32 id;
> @@ -1089,7 +1091,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                         progs = &cgrp->bpf.progs[atype];
>                         cnt = min_t(int, prog_list_length(progs), total_cnt);
>                         i = 0;
> -                       hlist_for_each_entry(pl, progs, node) {
> +                       hlist_for_each_entry_rcu(pl, progs, node) {
>                                 prog = prog_list_prog(pl);
>                                 id = prog->aux->id;
>                                 if (copy_to_user(prog_ids + i, &id, sizeof(id)))
> @@ -1099,7 +1101,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                         }
>
>                         if (prog_attach_flags) {
> -                               flags = cgrp->bpf.flags[atype];
> +                               flags = READ_ONCE(cgrp->bpf.flags[atype]);
>
>                                 for (i = 0; i < cnt; i++)
>                                         if (copy_to_user(prog_attach_flags + i,
> @@ -1112,6 +1114,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                 prog_ids += cnt;
>                 total_cnt -= cnt;
>         }
> +       if (total_cnt != 0)
> +               return -EAGAIN; /* raced with the detach */
>         return ret;
>  }
>
> @@ -1120,9 +1124,9 @@ static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>  {
>         int ret;
>
> -       mutex_lock(&cgroup_mutex);
> +       rcu_read_lock();
>         ret = __cgroup_bpf_query(cgrp, attr, uattr);
> -       mutex_unlock(&cgroup_mutex);
> +       rcu_read_unlock();
>         return ret;
>  }
>
> --
> 2.40.1.521.gf1e218fcd8-goog
>