On Fri, May 20, 2022 at 11:56 PM Martin KaFai Lau <kafai@xxxxxx> wrote:
>
> On Wed, May 18, 2022 at 03:55:24PM -0700, Stanislav Fomichev wrote:
> > Previous patch adds 1:1 mapping between all 211 LSM hooks
> > and bpf_cgroup program array. Instead of reserving a slot per
> > possible hook, reserve 10 slots per cgroup for lsm programs.
> > Those slots are dynamically allocated on demand and reclaimed.
> >
> > struct cgroup_bpf {
> > 	struct bpf_prog_array *    effective[33];       /*     0   264 */
> > 	/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
> > 	struct hlist_head          progs[33];           /*   264   264 */
> > 	/* --- cacheline 8 boundary (512 bytes) was 16 bytes ago --- */
> > 	u8                         flags[33];           /*   528    33 */
> >
> > 	/* XXX 7 bytes hole, try to pack */
> >
> > 	struct list_head           storages;            /*   568    16 */
> > 	/* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */
> > 	struct bpf_prog_array *    inactive;            /*   584     8 */
> > 	struct percpu_ref          refcnt;              /*   592    16 */
> > 	struct work_struct         release_work;        /*   608    72 */
> >
> > 	/* size: 680, cachelines: 11, members: 7 */
> > 	/* sum members: 673, holes: 1, sum holes: 7 */
> > 	/* last cacheline: 40 bytes */
> > };
> >
> > Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> > ---
> >  include/linux/bpf-cgroup-defs.h |   3 +-
> >  include/linux/bpf_lsm.h         |   6 --
> >  kernel/bpf/bpf_lsm.c            |   5 --
> >  kernel/bpf/cgroup.c             | 135 +++++++++++++++++++++++++++++---
> >  4 files changed, 125 insertions(+), 24 deletions(-)
> >
> > diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
> > index d5a70a35dace..359d3f16abea 100644
> > --- a/include/linux/bpf-cgroup-defs.h
> > +++ b/include/linux/bpf-cgroup-defs.h
> > @@ -10,7 +10,8 @@
> >
> >  struct bpf_prog_array;
> >
> > -#define CGROUP_LSM_NUM 211 /* will be addressed in the next patch */
> > +/* Maximum number of concurrently attachable per-cgroup LSM hooks. */
> > +#define CGROUP_LSM_NUM 10
> >
> >  enum cgroup_bpf_attach_type {
> >  	CGROUP_BPF_ATTACH_TYPE_INVALID = -1,
> > diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> > index 7f0e59f5f9be..613de44aa429 100644
> > --- a/include/linux/bpf_lsm.h
> > +++ b/include/linux/bpf_lsm.h
> > @@ -43,7 +43,6 @@ extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
> >  void bpf_inode_storage_free(struct inode *inode);
> >
> >  int bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);
> > -int bpf_lsm_hook_idx(u32 btf_id);
> >
> >  #else /* !CONFIG_BPF_LSM */
> >
> > @@ -74,11 +73,6 @@ static inline int bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
> >  	return -ENOENT;
> >  }
> >
> > -static inline int bpf_lsm_hook_idx(u32 btf_id)
> > -{
> > -	return -EINVAL;
> > -}
> > -
> >  #endif /* CONFIG_BPF_LSM */
> >
> >  #endif /* _LINUX_BPF_LSM_H */
> > diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> > index 654c23577ad3..96503c3e7a71 100644
> > --- a/kernel/bpf/bpf_lsm.c
> > +++ b/kernel/bpf/bpf_lsm.c
> > @@ -71,11 +71,6 @@ int bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
> >  	return 0;
> >  }
> >
> > -int bpf_lsm_hook_idx(u32 btf_id)
> > -{
> > -	return btf_id_set_index(&bpf_lsm_hooks, btf_id);
> > -}
> > -
> >  int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
> >  			const struct bpf_prog *prog)
> >  {
> > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> > index 2c356a38f4cf..a959cdd22870 100644
> > --- a/kernel/bpf/cgroup.c
> > +++ b/kernel/bpf/cgroup.c
> > @@ -132,15 +132,110 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
> >  }
> >
> >  #ifdef CONFIG_BPF_LSM
> > +struct list_head unused_bpf_lsm_atypes;
> > +struct list_head used_bpf_lsm_atypes;
> > +
> > +struct bpf_lsm_attach_type {
> > +	int index;
> > +	u32 btf_id;
> > +	int usecnt;
> > +	struct list_head atypes;
> > +	struct rcu_head rcu_head;
> > +};
> > +
> > +static int __init bpf_lsm_attach_type_init(void)
> > +{
> > +	struct bpf_lsm_attach_type *atype;
> > +	int i;
> > +
> > +	INIT_LIST_HEAD_RCU(&unused_bpf_lsm_atypes);
> > +	INIT_LIST_HEAD_RCU(&used_bpf_lsm_atypes);
> > +
> > +	for (i = 0; i < CGROUP_LSM_NUM; i++) {
> > +		atype = kzalloc(sizeof(*atype), GFP_KERNEL);
> > +		if (!atype)
> > +			continue;
> > +
> > +		atype->index = i;
> > +		list_add_tail_rcu(&atype->atypes, &unused_bpf_lsm_atypes);
> > +	}
> > +
> > +	return 0;
> > +}
> > +late_initcall(bpf_lsm_attach_type_init);
> > +
> >  static enum cgroup_bpf_attach_type bpf_lsm_attach_type_get(u32 attach_btf_id)
> >  {
> > -	return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id);
> > +	struct bpf_lsm_attach_type *atype;
> > +
> > +	lockdep_assert_held(&cgroup_mutex);
> > +
> > +	list_for_each_entry_rcu(atype, &used_bpf_lsm_atypes, atypes) {
> > +		if (atype->btf_id != attach_btf_id)
> > +			continue;
> > +
> > +		atype->usecnt++;
> > +		return CGROUP_LSM_START + atype->index;
> > +	}
> > +
> > +	atype = list_first_or_null_rcu(&unused_bpf_lsm_atypes, struct bpf_lsm_attach_type, atypes);
> > +	if (!atype)
> > +		return -E2BIG;
> > +
> > +	list_del_rcu(&atype->atypes);
> > +	atype->btf_id = attach_btf_id;
> > +	atype->usecnt = 1;
> > +	list_add_tail_rcu(&atype->atypes, &used_bpf_lsm_atypes);
> > +
> > +	return CGROUP_LSM_START + atype->index;
> > +}
> > +
> > +static void bpf_lsm_attach_type_reclaim(struct rcu_head *head)
> > +{
> > +	struct bpf_lsm_attach_type *atype =
> > +		container_of(head, struct bpf_lsm_attach_type, rcu_head);
> > +
> > +	atype->btf_id = 0;
> > +	atype->usecnt = 0;
> > +	list_add_tail_rcu(&atype->atypes, &unused_bpf_lsm_atypes);
> hmm... no need to hold the cgroup_mutex when changing
> the unused_bpf_lsm_atypes list?
> But it is an rcu callback, so a spinlock is needed.

Oh, good point.

> > +}
> > +
> > +static void bpf_lsm_attach_type_put(u32 attach_btf_id)
> > +{
> > +	struct bpf_lsm_attach_type *atype;
> > +
> > +	lockdep_assert_held(&cgroup_mutex);
> > +
> > +	list_for_each_entry_rcu(atype, &used_bpf_lsm_atypes, atypes) {
> > +		if (atype->btf_id != attach_btf_id)
> > +			continue;
> > +
> > +		if (--atype->usecnt <= 0) {
> > +			list_del_rcu(&atype->atypes);
> > +			WARN_ON_ONCE(atype->usecnt < 0);
> > +
> > +			/* call_rcu here prevents atype reuse within
> > +			 * the same rcu grace period.
> > +			 * shim programs use __bpf_prog_enter_lsm_cgroup
> > +			 * which starts RCU read section.
> It is a bit unclear to me why there is no need to assign
> 'shim_prog->aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID'
> here before reclaim, and why shim_prog->bpf_func does not need to check
> shim_prog->aux->cgroup_atype before using it.
>
> It would be very useful to have a few words of comment here to explain this.

My thinking is:
- the shim program starts an rcu read section (via __bpf_prog_enter_lsm_cgroup)
- on release (bpf_lsm_attach_type_put) we do list_del_rcu(&atype->atypes)
  to make sure that particular atype is "reserved" until the grace period
  ends and is not reused
- we won't reuse that particular atype for new attachments until the
  grace period
- existing shim programs will still use this atype until the grace period,
  but we rely on the cgroup effective array being empty by that point
- after the grace period, we reclaim that atype

Does that clarify your concern? Am I missing something? Not sure how to
put it into a small/concise comment :-) (maybe moot after your next
comment)

> > +			 */
> > +			call_rcu(&atype->rcu_head, bpf_lsm_attach_type_reclaim);
> How about doing this bpf_lsm_attach_type_put() in bpf_prog_free_deferred(),
> and only doing it for the shim_prog but not the cgroup-lsm prog?
> The shim_prog is the only one needing cgroup_atype. Then the cgroup_atype
> naturally can be reused when the shim_prog is being destroyed.
>
> bpf_prog_free_deferred has already gone through an rcu grace
> period (__bpf_prog_put_rcu) and it can block, so cgroup_mutex
> can be used.
>
> The need for the rcu_head here should go away also. The v6 array approach
> could be reconsidered.
>
> The cgroup-lsm progs do not necessarily need to hold a usecnt on the
> cgroup_atype. Their aux->cgroup_atype can be CGROUP_BPF_ATTACH_TYPE_INVALID.
> My understanding is the replace_prog->aux->cgroup_atype during attach
> is an optimization; it can always search again.

I considered using bpf_prog_free (I think Alexei also suggested it?),
but didn't end up using it because of the situation where the program
can be attached, then detached, but not actually freed (there is a link
or an fd holding it); in this case we'd be blocking that atype from
being reused. But I'm not sure that's a real problem. Let me try to see
if it works again; your suggestions make sense. (Especially the part
about cgroup_atype for the shim only; I don't like all these
replace_prog->aux->cgroup_atype = atype assignments in weird places.)

Thank you for the review!
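
To make it concrete, here is a rough, untested sketch of the direction
for the next version (the aux->cgroup_atype field is the one added
earlier in this series; the rest of the bpf_prog_free_deferred() body
in kernel/bpf/core.c is elided):

/* Untested sketch: with the put moved to prog-free time, the rcu_head
 * and call_rcu() go away. bpf_prog_free_deferred() runs after
 * __bpf_prog_put_rcu()'s grace period and can block, so cgroup_mutex
 * can protect both lists directly (the _rcu list variants may not be
 * needed anymore either, since get and put both run under the mutex).
 */
static void bpf_lsm_attach_type_put(u32 attach_btf_id)
{
	struct bpf_lsm_attach_type *atype;

	mutex_lock(&cgroup_mutex);
	list_for_each_entry(atype, &used_bpf_lsm_atypes, atypes) {
		if (atype->btf_id != attach_btf_id)
			continue;

		if (--atype->usecnt <= 0) {
			WARN_ON_ONCE(atype->usecnt < 0);
			atype->btf_id = 0;
			atype->usecnt = 0;
			/* grace period already passed before we got here,
			 * so the slot can be made reusable right away
			 */
			list_move_tail(&atype->atypes,
				       &unused_bpf_lsm_atypes);
		}
		break;
	}
	mutex_unlock(&cgroup_mutex);
}

/* In bpf_prog_free_deferred(), for the shim prog only
 * (aux = container_of(work, struct bpf_prog_aux, work)):
 */
	if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
		bpf_lsm_attach_type_put(aux->attach_btf_id);

If I read the suggestion right, the detached-but-not-freed concern above
also goes away: only the shim prog pins the atype, user space never
holds a link/fd to the shim, so it is freed (and the slot reclaimed) as
soon as the last cgroup-lsm prog for that hook detaches.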