On Fri, May 5, 2023 at 11:45 AM Stanislav Fomichev <sdf@xxxxxxxxxx> wrote:
>
> We're observing some stalls on the heavily loaded machines
> in the cgroup_bpf_prog_query path. This is likely due to
> being blocked on cgroup_mutex.
>
> IIUC, the cgroup_mutex is there mostly to protect the non-effective
> fields (cgrp->bpf.progs) which might be changed by the update path.
> For the BPF_F_QUERY_EFFECTIVE case, all we need is to rcu_dereference
> a bunch of pointers (and keep them around for consistency), so
> let's do it.
>
> Sending out as an RFC because it looks a bit ugly. It would also
> be nice to handle non-effective case locklessly as well, but it
> might require a larger rework.
>
> Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> ---
>  kernel/bpf/cgroup.c | 30 ++++++++++++++++++++----------
>  1 file changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index a06e118a9be5..c9d4b66e2c15 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -1022,10 +1022,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>          __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
>          bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
>          __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
> +        struct bpf_prog_array *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
>          enum bpf_attach_type type = attr->query.attach_type;
>          enum cgroup_bpf_attach_type from_atype, to_atype;
>          enum cgroup_bpf_attach_type atype;
> -        struct bpf_prog_array *effective;
>          int cnt, ret = 0, i;
>          int total_cnt = 0;
>          u32 flags;
> @@ -1051,9 +1051,9 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>
>          for (atype = from_atype; atype <= to_atype; atype++) {
>                  if (effective_query) {
> -                        effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
> -                                                              lockdep_is_held(&cgroup_mutex));
> -                        total_cnt += bpf_prog_array_length(effective);
> +                        effective[atype] = rcu_dereference_protected(cgrp->bpf.effective[atype],
> +                                                                     lockdep_is_held(&cgroup_mutex));
> +                        total_cnt += bpf_prog_array_length(effective[atype]);
>                  } else {
>                          total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
>                  }
> @@ -1076,10 +1076,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>
>          for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
>                  if (effective_query) {
> -                        effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
> -                                                              lockdep_is_held(&cgroup_mutex));
> -                        cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
> -                        ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
> +                        cnt = min_t(int, bpf_prog_array_length(effective[atype]), total_cnt);
> +                        ret = bpf_prog_array_copy_to_user(effective[atype], prog_ids, cnt);
>                  } else {
>                          struct hlist_head *progs;
>                          struct bpf_prog_list *pl;
> @@ -1118,11 +1116,23 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>  static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                              union bpf_attr __user *uattr)
>  {
> +        __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
> +        bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
> +        bool need_mutex = false;

Oops, this has to be true, but you get the idea...
>          int ret;
>
> -        mutex_lock(&cgroup_mutex);
> +        if (effective_query && !prog_attach_flags)
> +                need_mutex = false;
> +
> +        if (need_mutex)
> +                mutex_lock(&cgroup_mutex);
> +        else
> +                rcu_read_lock();
>          ret = __cgroup_bpf_query(cgrp, attr, uattr);
> -        mutex_unlock(&cgroup_mutex);
> +        if (need_mutex)
> +                mutex_unlock(&cgroup_mutex);
> +        else
> +                rcu_read_unlock();
>          return ret;
>  }
>
> --
> 2.40.1.521.gf1e218fcd8-goog
>
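To spell out that fix: need_mutex should start out as true, so we keep
taking cgroup_mutex by default and only drop down to rcu_read_lock() for
the effective-only query where prog_attach_flags isn't requested. Roughly
(an untested sketch, everything else stays as in the patch above):

static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                            union bpf_attr __user *uattr)
{
        __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
        bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
        bool need_mutex = true;         /* default to the existing locked path */
        int ret;

        /* Effective-only queries that don't ask for per-prog attach flags
         * only read the RCU-protected effective arrays, so they can skip
         * cgroup_mutex.
         */
        if (effective_query && !prog_attach_flags)
                need_mutex = false;

        if (need_mutex)
                mutex_lock(&cgroup_mutex);
        else
                rcu_read_lock();
        ret = __cgroup_bpf_query(cgrp, attr, uattr);
        if (need_mutex)
                mutex_unlock(&cgroup_mutex);
        else
                rcu_read_unlock();
        return ret;
}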