Re: [PATCH bpf-next v7 04/11] bpf: minimize number of allocated lsm slots per program

Martin KaFai Lau <kafai@xxxxxx> · Mon, 23 May 2022 22:53:41 -0700

On Mon, May 23, 2022 at 07:14:42PM -0700, Stanislav Fomichev wrote:
> On Fri, May 20, 2022 at 11:56 PM Martin KaFai Lau <kafai@xxxxxx> wrote:
> >
> > On Wed, May 18, 2022 at 03:55:24PM -0700, Stanislav Fomichev wrote:
> > > Previous patch adds 1:1 mapping between all 211 LSM hooks
> > > and bpf_cgroup program array. Instead of reserving a slot per
> > > possible hook, reserve 10 slots per cgroup for lsm programs.
> > > Those slots are dynamically allocated on demand and reclaimed.
> > >
> > > struct cgroup_bpf {
> > >       struct bpf_prog_array *    effective[33];        /*     0   264 */
> > >       /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
> > >       struct hlist_head          progs[33];            /*   264   264 */
> > >       /* --- cacheline 8 boundary (512 bytes) was 16 bytes ago --- */
> > >       u8                         flags[33];            /*   528    33 */
> > >
> > >       /* XXX 7 bytes hole, try to pack */
> > >
> > >       struct list_head           storages;             /*   568    16 */
> > >       /* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */
> > >       struct bpf_prog_array *    inactive;             /*   584     8 */
> > >       struct percpu_ref          refcnt;               /*   592    16 */
> > >       struct work_struct         release_work;         /*   608    72 */
> > >
> > >       /* size: 680, cachelines: 11, members: 7 */
> > >       /* sum members: 673, holes: 1, sum holes: 7 */
> > >       /* last cacheline: 40 bytes */
> > > };
> > >
> > > Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> > > ---
> > >  include/linux/bpf-cgroup-defs.h |   3 +-
> > >  include/linux/bpf_lsm.h         |   6 --
> > >  kernel/bpf/bpf_lsm.c            |   5 --
> > >  kernel/bpf/cgroup.c             | 135 +++++++++++++++++++++++++++++---
> > >  4 files changed, 125 insertions(+), 24 deletions(-)
> > >
> > > diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
> > > index d5a70a35dace..359d3f16abea 100644
> > > --- a/include/linux/bpf-cgroup-defs.h
> > > +++ b/include/linux/bpf-cgroup-defs.h
> > > @@ -10,7 +10,8 @@
> > >
> > >  struct bpf_prog_array;
> > >
> > > -#define CGROUP_LSM_NUM 211 /* will be addressed in the next patch */
> > > +/* Maximum number of concurrently attachable per-cgroup LSM hooks. */
> > > +#define CGROUP_LSM_NUM 10
> > >
> > >  enum cgroup_bpf_attach_type {
> > >       CGROUP_BPF_ATTACH_TYPE_INVALID = -1,
> > > diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> > > index 7f0e59f5f9be..613de44aa429 100644
> > > --- a/include/linux/bpf_lsm.h
> > > +++ b/include/linux/bpf_lsm.h
> > > @@ -43,7 +43,6 @@ extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
> > >  void bpf_inode_storage_free(struct inode *inode);
> > >
> > >  int bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);
> > > -int bpf_lsm_hook_idx(u32 btf_id);
> > >
> > >  #else /* !CONFIG_BPF_LSM */
> > >
> > > @@ -74,11 +73,6 @@ static inline int bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
> > >       return -ENOENT;
> > >  }
> > >
> > > -static inline int bpf_lsm_hook_idx(u32 btf_id)
> > > -{
> > > -     return -EINVAL;
> > > -}
> > > -
> > >  #endif /* CONFIG_BPF_LSM */
> > >
> > >  #endif /* _LINUX_BPF_LSM_H */
> > > diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> > > index 654c23577ad3..96503c3e7a71 100644
> > > --- a/kernel/bpf/bpf_lsm.c
> > > +++ b/kernel/bpf/bpf_lsm.c
> > > @@ -71,11 +71,6 @@ int bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
> > >       return 0;
> > >  }
> > >
> > > -int bpf_lsm_hook_idx(u32 btf_id)
> > > -{
> > > -     return btf_id_set_index(&bpf_lsm_hooks, btf_id);
> > > -}
> > > -
> > >  int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
> > >                       const struct bpf_prog *prog)
> > >  {
> > > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> > > index 2c356a38f4cf..a959cdd22870 100644
> > > --- a/kernel/bpf/cgroup.c
> > > +++ b/kernel/bpf/cgroup.c
> > > @@ -132,15 +132,110 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
> > >  }
> > >
> > >  #ifdef CONFIG_BPF_LSM
> > > +struct list_head unused_bpf_lsm_atypes;
> > > +struct list_head used_bpf_lsm_atypes;
> > > +
> > > +struct bpf_lsm_attach_type {
> > > +     int index;
> > > +     u32 btf_id;
> > > +     int usecnt;
> > > +     struct list_head atypes;
> > > +     struct rcu_head rcu_head;
> > > +};
> > > +
> > > +static int __init bpf_lsm_attach_type_init(void)
> > > +{
> > > +     struct bpf_lsm_attach_type *atype;
> > > +     int i;
> > > +
> > > +     INIT_LIST_HEAD_RCU(&unused_bpf_lsm_atypes);
> > > +     INIT_LIST_HEAD_RCU(&used_bpf_lsm_atypes);
> > > +
> > > +     for (i = 0; i < CGROUP_LSM_NUM; i++) {
> > > +             atype = kzalloc(sizeof(*atype), GFP_KERNEL);
> > > +             if (!atype)
> > > +                     continue;
> > > +
> > > +             atype->index = i;
> > > +             list_add_tail_rcu(&atype->atypes, &unused_bpf_lsm_atypes);
> > > +     }
> > > +
> > > +     return 0;
> > > +}
> > > +late_initcall(bpf_lsm_attach_type_init);
> > > +
> > >  static enum cgroup_bpf_attach_type bpf_lsm_attach_type_get(u32 attach_btf_id)
> > >  {
> > > -     return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id);
> > > +     struct bpf_lsm_attach_type *atype;
> > > +
> > > +     lockdep_assert_held(&cgroup_mutex);
> > > +
> > > +     list_for_each_entry_rcu(atype, &used_bpf_lsm_atypes, atypes) {
> > > +             if (atype->btf_id != attach_btf_id)
> > > +                     continue;
> > > +
> > > +             atype->usecnt++;
> > > +             return CGROUP_LSM_START + atype->index;
> > > +     }
> > > +
> > > +     atype = list_first_or_null_rcu(&unused_bpf_lsm_atypes, struct bpf_lsm_attach_type, atypes);
> > > +     if (!atype)
> > > +             return -E2BIG;
> > > +
> > > +     list_del_rcu(&atype->atypes);
> > > +     atype->btf_id = attach_btf_id;
> > > +     atype->usecnt = 1;
> > > +     list_add_tail_rcu(&atype->atypes, &used_bpf_lsm_atypes);
> > > +
> > > +     return CGROUP_LSM_START + atype->index;
> > > +}
> > > +
> > > +static void bpf_lsm_attach_type_reclaim(struct rcu_head *head)
> > > +{
> > > +     struct bpf_lsm_attach_type *atype =
> > > +             container_of(head, struct bpf_lsm_attach_type, rcu_head);
> > > +
> > > +     atype->btf_id = 0;
> > > +     atype->usecnt = 0;
> > > +     list_add_tail_rcu(&atype->atypes, &unused_bpf_lsm_atypes);
> > hmm...... no need to hold the cgroup_mutex when changing
> > the unused_bpf_lsm_atypes list ?
> > but it is a rcu callback, so spinlock is needed.
> 
> Oh, good point.
> 
> > > +}
> > > +
> > > +static void bpf_lsm_attach_type_put(u32 attach_btf_id)
> > > +{
> > > +     struct bpf_lsm_attach_type *atype;
> > > +
> > > +     lockdep_assert_held(&cgroup_mutex);
> > > +
> > > +     list_for_each_entry_rcu(atype, &used_bpf_lsm_atypes, atypes) {
> > > +             if (atype->btf_id != attach_btf_id)
> > > +                     continue;
> > > +
> > > +             if (--atype->usecnt <= 0) {
> > > +                     list_del_rcu(&atype->atypes);
> > > +                     WARN_ON_ONCE(atype->usecnt < 0);
> > > +
> > > +                     /* call_rcu here prevents atype reuse within
> > > +                      * the same rcu grace period.
> > > +                      * shim programs use __bpf_prog_enter_lsm_cgroup
> > > +                      * which starts RCU read section.
> > It is a bit unclear for me to think through why
> > there is no need to assign 'shim_prog->aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID'
> > here before reclaim and the shim_prog->bpf_func does not need to check
> > shim_prog->aux->cgroup_atype before using it.
> >
> > It will be very useful to have a few word comments here to explain this.
> 
> My thinking is:
> - shim program starts rcu read section (via __bpf_prog_enter_lsm_cgroup)
> - on release (bpf_lsm_attach_type_put) we do
> list_del_rcu(&atype->atypes) to make sure that particular atype is
> "reserved" until grace period and not being reused
> - we won't reuse that particular atype for new attachments until grace period
> - existing shim programs will still use this atype until grace period,
> but we rely on cgroup effective array be empty by that point
> - after grace period, we reclaim that atype
> 
> Does it clarify your concern? Am I missing something? Not sure how to
> put it into a small/concise comment :-)
Make sense.  Thanks for the explanation.

> 
> (maybe moot after your next comment)
> 
> > > +                      */
> > > +                     call_rcu(&atype->rcu_head, bpf_lsm_attach_type_reclaim);
> > How about doing this bpf_lsm_attach_type_put() in bpf_prog_free_deferred().
> > And only do it for the shim_prog but not the cgroup-lsm prog.
> > The shim_prog is the only one needing cgroup_atype.  Then the cgroup_atype
> > naturally can be reused when the shim_prog is being destroyed.
> >
> > bpf_prog_free_deferred has already gone through a rcu grace
> > period (__bpf_prog_put_rcu) and it can block, so cgroup_mutex
> > can be used.
> >
> > The need for the rcu_head here should go away also.  The v6 array approach
> > could be reconsidered.
> >
> > The cgroup-lsm prog does not necessarily need to hold a usecnt to the cgroup_atype.
> > Their aux->cgroup_atype can be CGROUP_BPF_ATTACH_TYPE_INVALID.
> > My understanding is the replace_prog->aux->cgroup_atype during attach
> > is an optimization, it can always search again.
> 
> I've considered using bpf_prog_free (I think Alexei also suggested
> it?), but not ended up using it because of the situation where the
> program can be attached, then detached but not actually freed (there
> is a link or an fd holding it); in this case we'll be blocking that
> atype reuse. But not sure if it's a real problem?
For cgroup-lsm prog loaded but not attached, it should not block the
atype to be reused.  It probably will not as long as the cgroup-lsm
prog is not the one holding the atype's refcnt.

For shim_prog, it only does bpf_prog_free_deferred() when there is
no cgroup-lsm prog attached to it.  That will happen after a rcu
grace period which should be fine.

> Let me try to see if it works again, your suggestions make sense.
> (especially the part about cgroup_atype for shim only, I don't like
> all these replace_prog->aux->cgroup_atype = atype in weird places)
Yep.  Sounds good.  It would be nice if some of the special handling for
BPF_LSM_CGROUP in cgroup.c can be simplified because of this.