On Fri, May 20, 2022 at 11:56 PM Martin KaFai Lau <kafai@xxxxxx> wrote:
>
> On Wed, May 18, 2022 at 03:55:24PM -0700, Stanislav Fomichev wrote:
> > Previous patch adds 1:1 mapping between all 211 LSM hooks
> > and bpf_cgroup program array. Instead of reserving a slot per
> > possible hook, reserve 10 slots per cgroup for lsm programs.
> > Those slots are dynamically allocated on demand and reclaimed.
> >
> > struct cgroup_bpf {
> > 	struct bpf_prog_array *    effective[33];       /*     0   264 */
> > 	/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
> > 	struct hlist_head          progs[33];           /*   264   264 */
> > 	/* --- cacheline 8 boundary (512 bytes) was 16 bytes ago --- */
> > 	u8                         flags[33];           /*   528    33 */
> >
> > 	/* XXX 7 bytes hole, try to pack */
> >
> > 	struct list_head           storages;            /*   568    16 */
> > 	/* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */
> > 	struct bpf_prog_array *    inactive;            /*   584     8 */
> > 	struct percpu_ref          refcnt;              /*   592    16 */
> > 	struct work_struct         release_work;        /*   608    72 */
> >
> > 	/* size: 680, cachelines: 11, members: 7 */
> > 	/* sum members: 673, holes: 1, sum holes: 7 */
> > 	/* last cacheline: 40 bytes */
> > };
> >
> > Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> > ---
> >  include/linux/bpf-cgroup-defs.h |   3 +-
> >  include/linux/bpf_lsm.h         |   6 --
> >  kernel/bpf/bpf_lsm.c            |   5 --
> >  kernel/bpf/cgroup.c             | 135 +++++++++++++++++++++++++++++---
> >  4 files changed, 125 insertions(+), 24 deletions(-)
> >
> > diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
> > index d5a70a35dace..359d3f16abea 100644
> > --- a/include/linux/bpf-cgroup-defs.h
> > +++ b/include/linux/bpf-cgroup-defs.h
> > @@ -10,7 +10,8 @@
> >
> >  struct bpf_prog_array;
> >
> > -#define CGROUP_LSM_NUM 211 /* will be addressed in the next patch */
> > +/* Maximum number of concurrently attachable per-cgroup LSM hooks. */
> > +#define CGROUP_LSM_NUM 10
> >
> >  enum cgroup_bpf_attach_type {
> >  	CGROUP_BPF_ATTACH_TYPE_INVALID = -1,
> > diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> > index 7f0e59f5f9be..613de44aa429 100644
> > --- a/include/linux/bpf_lsm.h
> > +++ b/include/linux/bpf_lsm.h
> > @@ -43,7 +43,6 @@ extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
> >  void bpf_inode_storage_free(struct inode *inode);
> >
> >  int bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);
> > -int bpf_lsm_hook_idx(u32 btf_id);
> >
> >  #else /* !CONFIG_BPF_LSM */
> >
> > @@ -74,11 +73,6 @@ static inline int bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
> >  	return -ENOENT;
> >  }
> >
> > -static inline int bpf_lsm_hook_idx(u32 btf_id)
> > -{
> > -	return -EINVAL;
> > -}
> > -
> >  #endif /* CONFIG_BPF_LSM */
> >
> >  #endif /* _LINUX_BPF_LSM_H */
> > diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> > index 654c23577ad3..96503c3e7a71 100644
> > --- a/kernel/bpf/bpf_lsm.c
> > +++ b/kernel/bpf/bpf_lsm.c
> > @@ -71,11 +71,6 @@ int bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
> >  	return 0;
> >  }
> >
> > -int bpf_lsm_hook_idx(u32 btf_id)
> > -{
> > -	return btf_id_set_index(&bpf_lsm_hooks, btf_id);
> > -}
> > -
> >  int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
> >  			const struct bpf_prog *prog)
> >  {
> > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> > index 2c356a38f4cf..a959cdd22870 100644
> > --- a/kernel/bpf/cgroup.c
> > +++ b/kernel/bpf/cgroup.c
> > @@ -132,15 +132,110 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
> >  }
> >
> >  #ifdef CONFIG_BPF_LSM
> > +struct list_head unused_bpf_lsm_atypes;
> > +struct list_head used_bpf_lsm_atypes;
> > +
> > +struct bpf_lsm_attach_type {
> > +	int index;
> > +	u32 btf_id;
> > +	int usecnt;
> > +	struct list_head atypes;
> > +	struct rcu_head rcu_head;
> > +};
> > +
> > +static int __init bpf_lsm_attach_type_init(void)
> > +{
> > +	struct bpf_lsm_attach_type *atype;
> > +	int i;
> > +
> > +	INIT_LIST_HEAD_RCU(&unused_bpf_lsm_atypes);
> > +	INIT_LIST_HEAD_RCU(&used_bpf_lsm_atypes);
> > +
> > +	for (i = 0; i < CGROUP_LSM_NUM; i++) {
> > +		atype = kzalloc(sizeof(*atype), GFP_KERNEL);
> > +		if (!atype)
> > +			continue;
> > +
> > +		atype->index = i;
> > +		list_add_tail_rcu(&atype->atypes, &unused_bpf_lsm_atypes);
> > +	}
> > +
> > +	return 0;
> > +}
> > +late_initcall(bpf_lsm_attach_type_init);
> > +
> >  static enum cgroup_bpf_attach_type bpf_lsm_attach_type_get(u32 attach_btf_id)
> >  {
> > -	return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id);
> > +	struct bpf_lsm_attach_type *atype;
> > +
> > +	lockdep_assert_held(&cgroup_mutex);
> > +
> > +	list_for_each_entry_rcu(atype, &used_bpf_lsm_atypes, atypes) {
> > +		if (atype->btf_id != attach_btf_id)
> > +			continue;
> > +
> > +		atype->usecnt++;
> > +		return CGROUP_LSM_START + atype->index;
> > +	}
> > +
> > +	atype = list_first_or_null_rcu(&unused_bpf_lsm_atypes, struct bpf_lsm_attach_type, atypes);
> > +	if (!atype)
> > +		return -E2BIG;
> > +
> > +	list_del_rcu(&atype->atypes);
> > +	atype->btf_id = attach_btf_id;
> > +	atype->usecnt = 1;
> > +	list_add_tail_rcu(&atype->atypes, &used_bpf_lsm_atypes);
> > +
> > +	return CGROUP_LSM_START + atype->index;
> > +}
> > +
> > +static void bpf_lsm_attach_type_reclaim(struct rcu_head *head)
> > +{
> > +	struct bpf_lsm_attach_type *atype =
> > +		container_of(head, struct bpf_lsm_attach_type, rcu_head);
> > +
> > +	atype->btf_id = 0;
> > +	atype->usecnt = 0;
> > +	list_add_tail_rcu(&atype->atypes, &unused_bpf_lsm_atypes);
> hmm... no need to hold the cgroup_mutex when changing
> the unused_bpf_lsm_atypes list?
> But it is an rcu callback, so a spinlock is needed.

Oh, good point.

> > +}
> > +
> > +static void bpf_lsm_attach_type_put(u32 attach_btf_id)
> > +{
> > +	struct bpf_lsm_attach_type *atype;
> > +
> > +	lockdep_assert_held(&cgroup_mutex);
> > +
> > +	list_for_each_entry_rcu(atype, &used_bpf_lsm_atypes, atypes) {
> > +		if (atype->btf_id != attach_btf_id)
> > +			continue;
> > +
> > +		if (--atype->usecnt <= 0) {
> > +			list_del_rcu(&atype->atypes);
> > +			WARN_ON_ONCE(atype->usecnt < 0);
> > +
> > +			/* call_rcu here prevents atype reuse within
> > +			 * the same rcu grace period.
> > +			 * shim programs use __bpf_prog_enter_lsm_cgroup
> > +			 * which starts RCU read section.
> It is a bit unclear to me why there is no need to assign
> 'shim_prog->aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID'
> here before reclaim, and why shim_prog->bpf_func does not need to check
> shim_prog->aux->cgroup_atype before using it.
>
> It would be very useful to have a few words of comment here to explain this.

My thinking is:
- the shim program starts an rcu read section (via __bpf_prog_enter_lsm_cgroup)
- on release (bpf_lsm_attach_type_put) we do list_del_rcu(&atype->atypes)
  to make sure that particular atype is "reserved" until the grace period
  ends and is not reused
- we won't reuse that particular atype for new attachments until the
  grace period
- existing shim programs will still use this atype until the grace period,
  but we rely on the cgroup effective array being empty by that point
- after the grace period, we reclaim that atype

Does that clarify your concern? Am I missing something? Not sure how to
put it into a small/concise comment :-) (maybe moot after your next
comment)

> > +			 */
> > +			call_rcu(&atype->rcu_head, bpf_lsm_attach_type_reclaim);
> How about doing this bpf_lsm_attach_type_put() in bpf_prog_free_deferred(),
> and only doing it for the shim_prog but not the cgroup-lsm prog?
> The shim_prog is the only one needing cgroup_atype. Then the cgroup_atype
> naturally can be reused when the shim_prog is being destroyed.
>
> bpf_prog_free_deferred has already gone through an rcu grace
> period (__bpf_prog_put_rcu) and it can block, so cgroup_mutex
> can be used.
>
> The need for the rcu_head here should go away also. The v6 array approach
> could be reconsidered.
>
> The cgroup-lsm progs do not necessarily need to hold a usecnt on the
> cgroup_atype. Their aux->cgroup_atype can be CGROUP_BPF_ATTACH_TYPE_INVALID.
> My understanding is the replace_prog->aux->cgroup_atype during attach
> is an optimization; it can always search again.

I considered using bpf_prog_free (I think Alexei also suggested it?),
but didn't end up using it because of the situation where the program
can be attached, then detached, but not actually freed (there is a link
or an fd holding it); in this case we'd be blocking that atype from
being reused. But I'm not sure that's a real problem. Let me try to see
if it works again; your suggestions make sense. (Especially the part
about cgroup_atype for the shim only; I don't like all these
replace_prog->aux->cgroup_atype = atype assignments in weird places.)

Thank you for the review!
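
To make it concrete, here is a rough, untested sketch of the direction
for the next version (the aux->cgroup_atype field is the one added
earlier in this series; the rest of the bpf_prog_free_deferred() body
in kernel/bpf/core.c is elided):

/* Untested sketch: with the put moved to prog-free time, the rcu_head
 * and call_rcu() go away. bpf_prog_free_deferred() runs after
 * __bpf_prog_put_rcu()'s grace period and can block, so cgroup_mutex
 * can protect both lists directly (the _rcu list variants may not be
 * needed anymore either, since get and put both run under the mutex).
 */
static void bpf_lsm_attach_type_put(u32 attach_btf_id)
{
	struct bpf_lsm_attach_type *atype;

	mutex_lock(&cgroup_mutex);
	list_for_each_entry(atype, &used_bpf_lsm_atypes, atypes) {
		if (atype->btf_id != attach_btf_id)
			continue;

		if (--atype->usecnt <= 0) {
			WARN_ON_ONCE(atype->usecnt < 0);
			atype->btf_id = 0;
			atype->usecnt = 0;
			/* grace period already passed before we got here,
			 * so the slot can be made reusable right away
			 */
			list_move_tail(&atype->atypes,
				       &unused_bpf_lsm_atypes);
		}
		break;
	}
	mutex_unlock(&cgroup_mutex);
}

/* In bpf_prog_free_deferred(), for the shim prog only
 * (aux = container_of(work, struct bpf_prog_aux, work)):
 */
	if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
		bpf_lsm_attach_type_put(aux->attach_btf_id);

If I read the suggestion right, the detached-but-not-freed concern above
also goes away: only the shim prog pins the atype, user space never
holds a link/fd to the shim, so it is freed (and the slot reclaimed) as
soon as the last cgroup-lsm prog for that hook detaches.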