On Sat, Mar 20, 2021 at 10:02:01AM -0700, Yonghong Song wrote: > Jiri Olsa reported a bug ([1]) in kernel where cgroup local > storage pointer may be NULL in bpf_get_local_storage() helper. > There are two issues uncovered by this bug: > (1). kprobe or tracepoint prog incorrectly sets cgroup local storage > before prog run, > (2). due to change from preempt_disable to migrate_disable, > preemption is possible and percpu storage might be overwritten > by other tasks. > > This issue (1) is fixed in [2]. This patch tried to address issue (2). > The following shows how things can go wrong: > task 1: bpf_cgroup_storage_set() for percpu local storage > preemption happens > task 2: bpf_cgroup_storage_set() for percpu local storage > preemption happens > task 1: run bpf program > > task 1 will effectively use the percpu local storage setting by task 2 > which will be either NULL or incorrect ones. > > Instead of just one common local storage per cpu, this patch fixed > the issue by permitting 8 local storages per cpu and each local > storage is identified by a task_struct pointer. This way, we > allow at most 8 nested preemption between bpf_cgroup_storage_set() > and bpf_cgroup_storage_unset(). The percpu local storage slot > is released (calling bpf_cgroup_storage_unset()) by the same task > after bpf program finished running. > bpf_test_run() is also fixed to use the new bpf_cgroup_storage_set() > interface. > > The patch is tested on top of [2] with reproducer in [1]. > Without this patch, kernel will emit error in 2-3 minutes. > With this patch, after one hour, still no error. > > [1] https://lore.kernel.org/bpf/CAKH8qBuXCfUz=w8L+Fj74OaUpbosO29niYwTki7e3Ag044_aww@xxxxxxxxxxxxxx/T > [2] https://lore.kernel.org/bpf/CAKH8qBuXCfUz=w8L+Fj74OaUpbosO29niYwTki7e3Ag044_aww@xxxxxxxxxxxxxx/T > > Cc: Jiri Olsa <jolsa@xxxxxxxxxx> > Cc: Roman Gushchin <guro@xxxxxx> > Signed-off-by: Yonghong Song <yhs@xxxxxx> It looks a bit artificial (8 storages to handle the nesting), but most likely it will work in the real life and the code looks correct to me. Please, feel free to add Acked-by: Roman Gushchin <guro@xxxxxx> and thank you for fixing it! Btw, is it intended for a stable backport? Thanks! > --- > include/linux/bpf-cgroup.h | 57 ++++++++++++++++++++++++++++++++------ > include/linux/bpf.h | 22 ++++++++++++--- > kernel/bpf/helpers.c | 15 +++++++--- > kernel/bpf/local_storage.c | 5 ++-- > net/bpf/test_run.c | 6 +++- > 5 files changed, 86 insertions(+), 19 deletions(-) > > Changelogs: > v2 -> v3: > . merge two patches as bpf_test_run() will have compilation error > and may fail with other changes. > . rewrite bpf_cgroup_storage_set() to be more inline with kernel > coding style. > v1 -> v2: > . fix compilation issues when CONFIG_CGROUPS is off or > CONFIG_CGROUP_BPF is off. > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h > index c42e02b4d84b..6a29fe11485d 100644 > --- a/include/linux/bpf-cgroup.h > +++ b/include/linux/bpf-cgroup.h > @@ -20,14 +20,25 @@ struct bpf_sock_ops_kern; > struct bpf_cgroup_storage; > struct ctl_table; > struct ctl_table_header; > +struct task_struct; > > #ifdef CONFIG_CGROUP_BPF > > extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; > #define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) > > -DECLARE_PER_CPU(struct bpf_cgroup_storage*, > - bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); > +#define BPF_CGROUP_STORAGE_NEST_MAX 8 > + > +struct bpf_cgroup_storage_info { > + struct task_struct *task; > + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; > +}; > + > +/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks > + * to use bpf cgroup storage simultaneously. > + */ > +DECLARE_PER_CPU(struct bpf_cgroup_storage_info, > + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); > > #define for_each_cgroup_storage_type(stype) \ > for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) > @@ -161,13 +172,42 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type( > return BPF_CGROUP_STORAGE_SHARED; > } > > -static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage > - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) > +static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage > + *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) > { > enum bpf_cgroup_storage_type stype; > + int i, err = 0; > + > + preempt_disable(); > + for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { > + if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) > + continue; > + > + this_cpu_write(bpf_cgroup_storage_info[i].task, current); > + for_each_cgroup_storage_type(stype) > + this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], > + storage[stype]); > + goto out; > + } > + err = -EBUSY; > + WARN_ON_ONCE(1); > + > +out: > + preempt_enable(); > + return err; > +} > + > +static inline void bpf_cgroup_storage_unset(void) > +{ > + int i; > + > + for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { > + if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) > + continue; > > - for_each_cgroup_storage_type(stype) > - this_cpu_write(bpf_cgroup_storage[stype], storage[stype]); > + this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); > + return; > + } > } > > struct bpf_cgroup_storage * > @@ -448,8 +488,9 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, > return -EINVAL; > } > > -static inline void bpf_cgroup_storage_set( > - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} > +static inline int bpf_cgroup_storage_set( > + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; } > +static inline void bpf_cgroup_storage_unset(void) {} > static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, > struct bpf_map *map) { return 0; } > static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index a47285cd39c2..3a6ae69743ff 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1090,6 +1090,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, > /* BPF program asks to set CN on the packet. */ > #define BPF_RET_SET_CN (1 << 0) > > +/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY, > + * if bpf_cgroup_storage_set() failed, the rest of programs > + * will not execute. This should be a really rare scenario > + * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of > + * preemptions all between bpf_cgroup_storage_set() and > + * bpf_cgroup_storage_unset() on the same cpu. > + */ > #define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ > ({ \ > struct bpf_prog_array_item *_item; \ > @@ -1102,10 +1109,12 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, > _array = rcu_dereference(array); \ > _item = &_array->items[0]; \ > while ((_prog = READ_ONCE(_item->prog))) { \ > - bpf_cgroup_storage_set(_item->cgroup_storage); \ > + if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ > + break; \ > func_ret = func(_prog, ctx); \ > _ret &= (func_ret & 1); \ > *(ret_flags) |= (func_ret >> 1); \ > + bpf_cgroup_storage_unset(); \ > _item++; \ > } \ > rcu_read_unlock(); \ > @@ -1126,9 +1135,14 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, > goto _out; \ > _item = &_array->items[0]; \ > while ((_prog = READ_ONCE(_item->prog))) { \ > - if (set_cg_storage) \ > - bpf_cgroup_storage_set(_item->cgroup_storage); \ > - _ret &= func(_prog, ctx); \ > + if (!set_cg_storage) { \ > + _ret &= func(_prog, ctx); \ > + } else { \ > + if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ > + break; \ > + _ret &= func(_prog, ctx); \ > + bpf_cgroup_storage_unset(); \ > + } \ > _item++; \ > } \ > _out: \ > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c > index 074800226327..f306611c4ddf 100644 > --- a/kernel/bpf/helpers.c > +++ b/kernel/bpf/helpers.c > @@ -382,8 +382,8 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { > }; > > #ifdef CONFIG_CGROUP_BPF > -DECLARE_PER_CPU(struct bpf_cgroup_storage*, > - bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); > +DECLARE_PER_CPU(struct bpf_cgroup_storage_info, > + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); > > BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) > { > @@ -392,10 +392,17 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) > * verifier checks that its value is correct. > */ > enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); > - struct bpf_cgroup_storage *storage; > + struct bpf_cgroup_storage *storage = NULL; > void *ptr; > + int i; > > - storage = this_cpu_read(bpf_cgroup_storage[stype]); > + for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { > + if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) > + continue; > + > + storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); > + break; > + } > > if (stype == BPF_CGROUP_STORAGE_SHARED) > ptr = &READ_ONCE(storage->buf)->data[0]; > diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c > index 2d4f9ac12377..bd11db9774c3 100644 > --- a/kernel/bpf/local_storage.c > +++ b/kernel/bpf/local_storage.c > @@ -9,10 +9,11 @@ > #include <linux/slab.h> > #include <uapi/linux/btf.h> > > -DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); > - > #ifdef CONFIG_CGROUP_BPF > > +DEFINE_PER_CPU(struct bpf_cgroup_storage_info, > + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); > + > #include "../cgroup/cgroup-internal.h" > > #define LOCAL_STORAGE_CREATE_FLAG_MASK \ > diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c > index 0abdd67f44b1..4aabf71cd95d 100644 > --- a/net/bpf/test_run.c > +++ b/net/bpf/test_run.c > @@ -106,12 +106,16 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, > > bpf_test_timer_enter(&t); > do { > - bpf_cgroup_storage_set(storage); > + ret = bpf_cgroup_storage_set(storage); > + if (ret) > + break; > > if (xdp) > *retval = bpf_prog_run_xdp(prog, ctx); > else > *retval = BPF_PROG_RUN(prog, ctx); > + > + bpf_cgroup_storage_unset(); > } while (bpf_test_timer_continue(&t, repeat, &ret, time)); > bpf_test_timer_leave(&t); > > -- > 2.30.2 >