Re: [PATCH bpf-next] RFC: bpf: query effective progs without cgroup_mutex

On Thu, May 11, 2023 at 8:57 AM Stanislav Fomichev <sdf@xxxxxxxxxx> wrote:
>
> We're observing stalls on heavily loaded machines in the
> cgroup_bpf_prog_query path, most likely because the query is
> blocked on cgroup_mutex.
>
> IIUC, cgroup_mutex is there mostly to protect the non-effective
> fields (cgrp->bpf.progs), which can be changed by the update path.
> For the BPF_F_QUERY_EFFECTIVE case, all we need is to rcu_dereference
> a bunch of pointers (and keep them alive for the duration of the
> query for consistency), so let's do that.
>
> Sending this out as an RFC because it looks a bit ugly. It would
> also be nice to handle the non-effective case locklessly, but that
> might require a larger rework.
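
For context on the approach: the effective arrays are already published
as RCU pointers, so the query side only needs the usual RCU read-side
pattern. A minimal sketch of that read side (query_effective_cnt() is a
made-up helper for illustration, not code from the patch):

    /* Sketch only: count effective progs without cgroup_mutex.
     * The array pointer is only valid inside the RCU read-side
     * critical section, so everything that touches it has to
     * happen before rcu_read_unlock().
     */
    static int query_effective_cnt(struct cgroup *cgrp,
                                   enum cgroup_bpf_attach_type atype)
    {
            struct bpf_prog_array *effective;
            int cnt;

            rcu_read_lock();
            effective = rcu_dereference(cgrp->bpf.effective[atype]);
            cnt = bpf_prog_array_length(effective);
            rcu_read_unlock();

            return cnt;
    }

This is also why the patch below snapshots all the effective arrays
into a local array first: the same pointers have to be used both for
counting the progs and for copying their IDs to userspace.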

Oops, please ignore this one. Somehow it got into my unrelated sockopt series.

> Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> ---
>  include/linux/bpf-cgroup-defs.h |  2 +-
>  include/linux/bpf-cgroup.h      |  1 +
>  kernel/bpf/cgroup.c             | 80 +++++++++++++++++----------------
>  3 files changed, 44 insertions(+), 39 deletions(-)
>
> diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
> index 7b121bd780eb..df0b8faa1a17 100644
> --- a/include/linux/bpf-cgroup-defs.h
> +++ b/include/linux/bpf-cgroup-defs.h
> @@ -56,7 +56,7 @@ struct cgroup_bpf {
>          * have either zero or one element
>          * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
>          */
> -       struct hlist_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
> +       struct hlist_head __rcu progs[MAX_CGROUP_BPF_ATTACH_TYPE];
>         u8 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
>
>         /* list of cgroup shared storages */
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 57e9e109257e..555e9cbb3a05 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -106,6 +106,7 @@ struct bpf_prog_list {
>         struct bpf_prog *prog;
>         struct bpf_cgroup_link *link;
>         struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
> +       struct rcu_head rcu;
>  };
>
>  int cgroup_bpf_inherit(struct cgroup *cgrp);
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index a06e118a9be5..92a1b33dcc06 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -285,12 +285,15 @@ static void cgroup_bpf_release(struct work_struct *work)
>         mutex_lock(&cgroup_mutex);
>
>         for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
> -               struct hlist_head *progs = &cgrp->bpf.progs[atype];
> +               struct hlist_head *progs;
>                 struct bpf_prog_list *pl;
>                 struct hlist_node *pltmp;
>
> +               progs = rcu_dereference_protected(&cgrp->bpf.progs[atype],
> +                                                 lockdep_is_held(&cgroup_mutex));
> +
>                 hlist_for_each_entry_safe(pl, pltmp, progs, node) {
> -                       hlist_del(&pl->node);
> +                       hlist_del_rcu(&pl->node);
>                         if (pl->prog) {
>                                 if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
>                                         bpf_trampoline_unlink_cgroup_shim(pl->prog);
> @@ -301,7 +304,7 @@ static void cgroup_bpf_release(struct work_struct *work)
>                                         bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
>                                 bpf_cgroup_link_auto_detach(pl->link);
>                         }
> -                       kfree(pl);
> +                       kfree_rcu(pl, rcu);
>                         static_branch_dec(&cgroup_bpf_enabled_key[atype]);
>                 }
>                 old_array = rcu_dereference_protected(
> @@ -352,12 +355,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
>  /* count number of elements in the list.
>   * it's slow but the list cannot be long
>   */
> -static u32 prog_list_length(struct hlist_head *head)
> +static u32 prog_list_length(struct hlist_head __rcu *head)
>  {
>         struct bpf_prog_list *pl;
>         u32 cnt = 0;
>
> -       hlist_for_each_entry(pl, head, node) {
> +       hlist_for_each_entry_rcu(pl, head, node) {
>                 if (!prog_list_prog(pl))
>                         continue;
>                 cnt++;
> @@ -553,7 +556,7 @@ static int update_effective_progs(struct cgroup *cgrp,
>
>  #define BPF_CGROUP_MAX_PROGS 64
>
> -static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
> +static struct bpf_prog_list *find_attach_entry(struct hlist_head __rcu *progs,
>                                                struct bpf_prog *prog,
>                                                struct bpf_cgroup_link *link,
>                                                struct bpf_prog *replace_prog,
> @@ -565,10 +568,10 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
>         if (!allow_multi) {
>                 if (hlist_empty(progs))
>                         return NULL;
> -               return hlist_entry(progs->first, typeof(*pl), node);
> +               return hlist_entry(rcu_dereference_raw(progs)->first, typeof(*pl), node);
>         }
>
> -       hlist_for_each_entry(pl, progs, node) {
> +       hlist_for_each_entry_rcu(pl, progs, node) {
>                 if (prog && pl->prog == prog && prog != replace_prog)
>                         /* disallow attaching the same prog twice */
>                         return ERR_PTR(-EINVAL);
> @@ -579,7 +582,7 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
>
>         /* direct prog multi-attach w/ replacement case */
>         if (replace_prog) {
> -               hlist_for_each_entry(pl, progs, node) {
> +               hlist_for_each_entry_rcu(pl, progs, node) {
>                         if (pl->prog == replace_prog)
>                                 /* a match found */
>                                 return pl;
> @@ -615,8 +618,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>         struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
>         struct bpf_prog *new_prog = prog ? : link->link.prog;
>         enum cgroup_bpf_attach_type atype;
> +       struct hlist_head __rcu *progs;
>         struct bpf_prog_list *pl;
> -       struct hlist_head *progs;
>         int err;
>
>         if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
> @@ -658,6 +661,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>                                       prog ? : link->link.prog, cgrp))
>                 return -ENOMEM;
>
> +       WRITE_ONCE(cgrp->bpf.flags[atype], saved_flags);
>         if (pl) {
>                 old_prog = pl->prog;
>         } else {
> @@ -669,12 +673,12 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>                         return -ENOMEM;
>                 }
>                 if (hlist_empty(progs))
> -                       hlist_add_head(&pl->node, progs);
> +                       hlist_add_head_rcu(&pl->node, progs);
>                 else
> -                       hlist_for_each(last, progs) {
> +                       __hlist_for_each_rcu(last, progs) {
>                                 if (last->next)
>                                         continue;
> -                               hlist_add_behind(&pl->node, last);
> +                               hlist_add_behind_rcu(&pl->node, last);
>                                 break;
>                         }
>         }
> @@ -682,7 +686,6 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>         pl->prog = prog;
>         pl->link = link;
>         bpf_cgroup_storages_assign(pl->storage, storage);
> -       cgrp->bpf.flags[atype] = saved_flags;
>
>         if (type == BPF_LSM_CGROUP) {
>                 err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
> @@ -796,7 +799,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
>         enum cgroup_bpf_attach_type atype;
>         struct bpf_prog *old_prog;
>         struct bpf_prog_list *pl;
> -       struct hlist_head *progs;
> +       struct hlist_head __rcu *progs;
>         bool found = false;
>
>         atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
> @@ -808,7 +811,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
>         if (link->link.prog->type != new_prog->type)
>                 return -EINVAL;
>
> -       hlist_for_each_entry(pl, progs, node) {
> +       hlist_for_each_entry_rcu(pl, progs, node) {
>                 if (pl->link == link) {
>                         found = true;
>                         break;
> @@ -847,7 +850,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
>         return ret;
>  }
>
> -static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
> +static struct bpf_prog_list *find_detach_entry(struct hlist_head __rcu *progs,
>                                                struct bpf_prog *prog,
>                                                struct bpf_cgroup_link *link,
>                                                bool allow_multi)
> @@ -862,7 +865,7 @@ static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
>                 /* to maintain backward compatibility NONE and OVERRIDE cgroups
>                  * allow detaching with invalid FD (prog==NULL) in legacy mode
>                  */
> -               return hlist_entry(progs->first, typeof(*pl), node);
> +               return hlist_entry(rcu_dereference_raw(progs)->first, typeof(*pl), node);
>         }
>
>         if (!prog && !link)
> @@ -872,7 +875,7 @@ static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
>                 return ERR_PTR(-EINVAL);
>
>         /* find the prog or link and detach it */
> -       hlist_for_each_entry(pl, progs, node) {
> +       hlist_for_each_entry_rcu(pl, progs, node) {
>                 if (pl->prog == prog && pl->link == link)
>                         return pl;
>         }
> @@ -894,9 +897,9 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
>                                   enum cgroup_bpf_attach_type atype)
>  {
>         struct cgroup_subsys_state *css;
> +       struct hlist_head __rcu *head;
>         struct bpf_prog_array *progs;
>         struct bpf_prog_list *pl;
> -       struct hlist_head *head;
>         struct cgroup *cg;
>         int pos;
>
> @@ -913,7 +916,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
>                                 continue;
>
>                         head = &cg->bpf.progs[atype];
> -                       hlist_for_each_entry(pl, head, node) {
> +                       hlist_for_each_entry_rcu(pl, head, node) {
>                                 if (!prog_list_prog(pl))
>                                         continue;
>                                 if (pl->prog == prog && pl->link == link)
> @@ -950,9 +953,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
>                                struct bpf_cgroup_link *link, enum bpf_attach_type type)
>  {
>         enum cgroup_bpf_attach_type atype;
> +       struct hlist_head __rcu *progs;
>         struct bpf_prog *old_prog;
>         struct bpf_prog_list *pl;
> -       struct hlist_head *progs;
>         u32 attach_btf_id = 0;
>         u32 flags;
>
> @@ -989,12 +992,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
>         }
>
>         /* now can actually delete it from this cgroup list */
> -       hlist_del(&pl->node);
> +       hlist_del_rcu(&pl->node);
>
> -       kfree(pl);
> +       kfree_rcu(pl, rcu);
>         if (hlist_empty(progs))
>                 /* last program was detached, reset flags to zero */
> -               cgrp->bpf.flags[atype] = 0;
> +               WRITE_ONCE(cgrp->bpf.flags[atype], 0);
>         if (old_prog) {
>                 if (type == BPF_LSM_CGROUP)
>                         bpf_trampoline_unlink_cgroup_shim(old_prog);
> @@ -1022,10 +1025,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>         __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
>         bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
>         __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
> +       struct bpf_prog_array *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
>         enum bpf_attach_type type = attr->query.attach_type;
>         enum cgroup_bpf_attach_type from_atype, to_atype;
>         enum cgroup_bpf_attach_type atype;
> -       struct bpf_prog_array *effective;
>         int cnt, ret = 0, i;
>         int total_cnt = 0;
>         u32 flags;
> @@ -1046,14 +1049,15 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                 if (from_atype < 0)
>                         return -EINVAL;
>                 to_atype = from_atype;
> -               flags = cgrp->bpf.flags[from_atype];
> +               flags = READ_ONCE(cgrp->bpf.flags[from_atype]);
>         }
>
>         for (atype = from_atype; atype <= to_atype; atype++) {
>                 if (effective_query) {
> -                       effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
> -                                                             lockdep_is_held(&cgroup_mutex));
> -                       total_cnt += bpf_prog_array_length(effective);
> +                       effective[atype] = rcu_dereference_protected(
> +                                               cgrp->bpf.effective[atype],
> +                                               lockdep_is_held(&cgroup_mutex));
> +                       total_cnt += bpf_prog_array_length(effective[atype]);
>                 } else {
>                         total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
>                 }
> @@ -1076,12 +1080,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>
>         for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
>                 if (effective_query) {
> -                       effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
> -                                                             lockdep_is_held(&cgroup_mutex));
> -                       cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
> -                       ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
> +                       cnt = min_t(int, bpf_prog_array_length(effective[atype]), total_cnt);
> +                       ret = bpf_prog_array_copy_to_user(effective[atype], prog_ids, cnt);
>                 } else {
> -                       struct hlist_head *progs;
> +                       struct hlist_head __rcu *progs;
>                         struct bpf_prog_list *pl;
>                         struct bpf_prog *prog;
>                         u32 id;
> @@ -1089,7 +1091,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                         progs = &cgrp->bpf.progs[atype];
>                         cnt = min_t(int, prog_list_length(progs), total_cnt);
>                         i = 0;
> -                       hlist_for_each_entry(pl, progs, node) {
> +                       hlist_for_each_entry_rcu(pl, progs, node) {
>                                 prog = prog_list_prog(pl);
>                                 id = prog->aux->id;
>                                 if (copy_to_user(prog_ids + i, &id, sizeof(id)))
> @@ -1099,7 +1101,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                         }
>
>                         if (prog_attach_flags) {
> -                               flags = cgrp->bpf.flags[atype];
> +                               flags = READ_ONCE(cgrp->bpf.flags[atype]);
>
>                                 for (i = 0; i < cnt; i++)
>                                         if (copy_to_user(prog_attach_flags + i,
> @@ -1112,6 +1114,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                 prog_ids += cnt;
>                 total_cnt -= cnt;
>         }
> +       if (total_cnt != 0)
> +               return -EAGAIN; /* raced with the detach */
>         return ret;
>  }
>
> @@ -1120,9 +1124,9 @@ static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>  {
>         int ret;
>
> -       mutex_lock(&cgroup_mutex);
> +       rcu_read_lock();
>         ret = __cgroup_bpf_query(cgrp, attr, uattr);
> -       mutex_unlock(&cgroup_mutex);
> +       rcu_read_unlock();
>         return ret;
>  }
>
> --
> 2.40.1.521.gf1e218fcd8-goog
>
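
One more note on the update side, since the lockless read only works if
writers retire entries with a grace period: list updates still run
under cgroup_mutex, but unlink and free have to use the RCU variants so
that a concurrent lockless reader never follows a pointer into freed
memory. The intended pattern, sketched (this is the pattern the patch
relies on, not a verbatim excerpt):

    /* Writer side, with cgroup_mutex held. hlist_del_rcu() unlinks
     * the entry while letting in-flight readers keep walking their
     * saved next pointers; kfree_rcu() defers the actual free past
     * a grace period, via the rcu_head the patch adds to
     * struct bpf_prog_list.
     */
    hlist_del_rcu(&pl->node);
    kfree_rcu(pl, rcu);

That race is also why __cgroup_bpf_query() re-checks total_cnt at the
end and returns -EAGAIN: a detach can slip in between counting the
progs and copying the IDs, and a non-zero total_cnt at the end is the
signal that the query raced and should be retried.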




