> On 7 Dec 2022, at 10:14 AM, Namhyung Kim <namhyung@xxxxxxxxx> wrote:
>
> On Tue, Dec 06, 2022 at 12:09:51PM -0800, Alexei Starovoitov wrote:
>> On Mon, Dec 5, 2022 at 4:28 AM Jiri Olsa <olsajiri@xxxxxxxxx> wrote:
>>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>>> index 3bbd3f0c810c..d27b7dc77894 100644
>>> --- a/kernel/trace/bpf_trace.c
>>> +++ b/kernel/trace/bpf_trace.c
>>> @@ -2252,9 +2252,8 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
>>>  }
>>>
>>>  static __always_inline
>>> -void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
>>> +void __bpf_trace_prog_run(struct bpf_prog *prog, u64 *args)
>>>  {
>>> -	cant_sleep();
>>>  	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
>>>  		bpf_prog_inc_misses_counter(prog);
>>>  		goto out;
>>> @@ -2266,6 +2265,22 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
>>>  	this_cpu_dec(*(prog->active));
>>>  }
>>>
>>> +static __always_inline
>>> +void __bpf_trace_run(struct bpf_raw_event_data *data, u64 *args)
>>> +{
>>> +	struct bpf_prog *prog = data->prog;
>>> +
>>> +	cant_sleep();
>>> +	if (unlikely(!data->recursion))
>>> +		return __bpf_trace_prog_run(prog, args);
>>> +
>>> +	if (unlikely(this_cpu_inc_return(*(data->recursion))))
>>> +		goto out;
>>> +	__bpf_trace_prog_run(prog, args);
>>> +out:
>>> +	this_cpu_dec(*(data->recursion));
>>> +}
>>
>> This is way too much run-time and memory overhead to address
>> this corner case. Pls come up with some other approach.
>> Sorry I don't have decent suggestions at the moment.
>> For now we can simply disallow attaching to contention_begin.
>>
>
> How about this? It seems to work for me.

How about progs that are attached with kprobe?

See this one:
https://lore.kernel.org/bpf/CACkBjsb3GRw5aiTT=RCUs3H5aum_QN+B0ZqZA=MvjspUP6NFMg@xxxxxxxxxxxxxx/T/#u

>
> Thanks,
> Namhyung
>
> ---
>  include/linux/trace_events.h    | 14 +++++++
>  include/linux/tracepoint-defs.h |  5 +++
>  kernel/bpf/syscall.c            | 18 ++++++++-
>  kernel/trace/bpf_trace.c        | 65 ++++++++++++++++++++++++++++++---
>  4 files changed, 95 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 20749bd9db71..461468210a77 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -742,6 +742,10 @@ void perf_event_detach_bpf_prog(struct perf_event *event);
>  int perf_event_query_prog_array(struct perf_event *event, void __user *info);
>  int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>  int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
> +int bpf_probe_register_norecurse(struct bpf_raw_event_map *btp, struct bpf_prog *prog,
> +				 struct bpf_raw_event_data *data);
> +int bpf_probe_unregister_norecurse(struct bpf_raw_event_map *btp,
> +				   struct bpf_raw_event_data *data);
>  struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
>  void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
>  int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> @@ -775,6 +779,16 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf
>  {
>  	return -EOPNOTSUPP;
>  }
> +static inline int bpf_probe_register_norecurse(struct bpf_raw_event_map *btp, struct bpf_prog *p,
> +					       struct bpf_raw_event_data *data)
> +{
> +	return -EOPNOTSUPP;
> +}
> +static inline int bpf_probe_unregister_norecurse(struct bpf_raw_event_map *btp,
> +						 struct bpf_raw_event_data *data)
> +{
> +	return -EOPNOTSUPP;
> +}
>  static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
>  {
>  	return NULL;
> diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
> index e7c2276be33e..e5adfe606888 100644
> --- a/include/linux/tracepoint-defs.h
> +++ b/include/linux/tracepoint-defs.h
> @@ -53,6 +53,11 @@ struct bpf_raw_event_map {
>  	u32			writable_size;
>  } __aligned(32);
>
> +struct bpf_raw_event_data {
> +	struct bpf_prog		*prog;
> +	int __percpu		*active;
> +};
> +
>  /*
>   * If a tracepoint needs to be called from a header file, it is not
>   * recommended to call it directly, as tracepoints in header files
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 35972afb6850..a8be9c443306 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3144,14 +3144,24 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
>  struct bpf_raw_tp_link {
>  	struct bpf_link link;
>  	struct bpf_raw_event_map *btp;
> +	struct bpf_raw_event_data data;
>  };
>
> +static bool needs_recursion_check(struct bpf_raw_event_map *btp)
> +{
> +	return !strcmp(btp->tp->name, "contention_begin");
> +}
> +
>  static void bpf_raw_tp_link_release(struct bpf_link *link)
>  {
>  	struct bpf_raw_tp_link *raw_tp =
>  		container_of(link, struct bpf_raw_tp_link, link);
>
> -	bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
> +	if (needs_recursion_check(raw_tp->btp))
> +		bpf_probe_unregister_norecurse(raw_tp->btp, &raw_tp->data);
> +	else
> +		bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
> +
>  	bpf_put_raw_tracepoint(raw_tp->btp);
>  }
>
> @@ -3348,7 +3358,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
>  		goto out_put_btp;
>  	}
>
> -	err = bpf_probe_register(link->btp, prog);
> +	if (needs_recursion_check(link->btp))
> +		err = bpf_probe_register_norecurse(link->btp, prog, &link->data);
> +	else
> +		err = bpf_probe_register(link->btp, prog);
> +
>  	if (err) {
>  		bpf_link_cleanup(&link_primer);
>  		goto out_put_btp;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 3bbd3f0c810c..edbfeff029aa 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -2297,7 +2297,20 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
>  		REPEAT(x, COPY, __DL_SEM, __SEQ_0_11);		\
>  		__bpf_trace_run(prog, args);			\
>  	}							\
> -	EXPORT_SYMBOL_GPL(bpf_trace_run##x)
> +	EXPORT_SYMBOL_GPL(bpf_trace_run##x);			\
> +								\
> +	static void bpf_trace_run_norecurse##x(struct bpf_raw_event_data *data,	\
> +					       REPEAT(x, SARG, __DL_COM, __SEQ_0_11))	\
> +	{							\
> +		u64 args[x];					\
> +		if (unlikely(this_cpu_inc_return(*(data->active)) != 1))	\
> +			goto out;				\
> +		REPEAT(x, COPY, __DL_SEM, __SEQ_0_11);		\
> +		__bpf_trace_run(data->prog, args);		\
> +	out:							\
> +		this_cpu_dec(*(data->active));			\
> +	}
> +
>  BPF_TRACE_DEFN_x(1);
>  BPF_TRACE_DEFN_x(2);
>  BPF_TRACE_DEFN_x(3);
> @@ -2311,7 +2324,23 @@ BPF_TRACE_DEFN_x(10);
>  BPF_TRACE_DEFN_x(11);
>  BPF_TRACE_DEFN_x(12);
>
> -static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
> +static void *bpf_trace_norecurse_funcs[12] = {
> +	(void *)bpf_trace_run_norecurse1,
> +	(void *)bpf_trace_run_norecurse2,
> +	(void *)bpf_trace_run_norecurse3,
> +	(void *)bpf_trace_run_norecurse4,
> +	(void *)bpf_trace_run_norecurse5,
> +	(void *)bpf_trace_run_norecurse6,
> +	(void *)bpf_trace_run_norecurse7,
> +	(void *)bpf_trace_run_norecurse8,
> +	(void *)bpf_trace_run_norecurse9,
> +	(void *)bpf_trace_run_norecurse10,
> +	(void *)bpf_trace_run_norecurse11,
> +	(void *)bpf_trace_run_norecurse12,
> +};
> +
> +static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog,
> +				void *func, void *data)
>  {
>  	struct tracepoint *tp = btp->tp;
>
> @@ -2325,13 +2354,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
>  	if (prog->aux->max_tp_access > btp->writable_size)
>  		return -EINVAL;
>
> -	return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
> -						   prog);
> +	return tracepoint_probe_register_may_exist(tp, func, data);
>  }
>
>  int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
>  {
> -	return __bpf_probe_register(btp, prog);
> +	return __bpf_probe_register(btp, prog, btp->bpf_func, prog);
>  }
>
>  int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
> @@ -2339,6 +2367,33 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
>  	return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
>  }
>
> +int bpf_probe_register_norecurse(struct bpf_raw_event_map *btp, struct bpf_prog *prog,
> +				 struct bpf_raw_event_data *data)
> +{
> +	void *bpf_func;
> +
> +	data->active = alloc_percpu_gfp(int, GFP_KERNEL);
> +	if (!data->active)
> +		return -ENOMEM;
> +
> +	data->prog = prog;
> +	bpf_func = bpf_trace_norecurse_funcs[btp->num_args];
> +	return __bpf_probe_register(btp, prog, bpf_func, data);
> +}
> +
> +int bpf_probe_unregister_norecurse(struct bpf_raw_event_map *btp,
> +				   struct bpf_raw_event_data *data)
> +{
> +	int err;
> +	void *bpf_func;
> +
> +	bpf_func = bpf_trace_norecurse_funcs[btp->num_args];
> +	err = tracepoint_probe_unregister(btp->tp, bpf_func, data);
> +	free_percpu(data->active);
> +
> +	return err;
> +}
> +
>  int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>  			    u32 *fd_type, const char **buf,
>  			    u64 *probe_offset, u64 *probe_addr)
> --
> 2.39.0.rc0.267.gcb52ba06e7-goog
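
One detail worth double-checking in the patch above: bpf_trace_norecurse_funcs[]
holds the wrappers for 1..12 arguments at indices 0..11, but both
bpf_probe_register_norecurse() and bpf_probe_unregister_norecurse() index it
with btp->num_args directly, so a tracepoint with N args would get the
(N+1)-arg wrapper and a 12-arg tracepoint would read past the array;
btp->num_args - 1 looks like the intended index.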
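For context on why a guard is needed at all, the recursion can be reproduced
from the BPF side along these lines. This is a minimal sketch, not code from
the thread: the program and map names are made up, and it assumes the
contention_begin prototype from include/trace/events/lock.h
(void *lock, unsigned int flags). A program attached to contention_begin
updates a hash map; the update may take the map's internal bucket spinlock,
and contention on that lock fires contention_begin again on the same CPU:

/* lock_stat.bpf.c -- hypothetical example, build with clang -target bpf */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 10240);
	__type(key, __u64);
	__type(value, __u64);
} lock_stat SEC(".maps");

SEC("raw_tp/contention_begin")
int BPF_PROG(on_contention_begin, void *lock, unsigned int flags)
{
	__u64 key = (__u64)lock;
	__u64 init = 1, *cnt;

	cnt = bpf_map_lookup_elem(&lock_stat, &key);
	if (cnt) {
		__sync_fetch_and_add(cnt, 1);
		return 0;
	}
	/*
	 * The update may take the hash bucket's spinlock; if that lock
	 * is contended, contention_begin fires again on this CPU and
	 * re-enters this same program -- the recursion that the percpu
	 * 'active' counter in the patch above is meant to cut off.
	 */
	bpf_map_update_elem(&lock_stat, &key, &init, BPF_ANY);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";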
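On the kprobe question raised above: the needs_recursion_check() branch only
runs for raw tracepoint links, which attach through bpf_raw_tp_link_attach(),
while a kprobe attachment reaches the kernel through the perf event path, so a
program tracing the lock functions that way would not pass through the
norecurse wrappers at all. A userspace sketch of the two attach paths
(hypothetical object and program names; assumes libbpf):

/* attach_paths.c -- hypothetical example, link against libbpf */
#include <bpf/libbpf.h>

int attach_both(void)
{
	struct bpf_object *obj;
	struct bpf_program *raw_tp_prog, *kprobe_prog;
	struct bpf_link *l1, *l2;

	obj = bpf_object__open_file("lock_stat.bpf.o", NULL);
	if (!obj || bpf_object__load(obj))
		return -1;

	raw_tp_prog = bpf_object__find_program_by_name(obj, "on_contention_begin");
	kprobe_prog = bpf_object__find_program_by_name(obj, "on_spin_lock"); /* hypothetical */
	if (!raw_tp_prog || !kprobe_prog)
		return -1;

	/* BPF_RAW_TRACEPOINT_OPEN -> bpf_raw_tp_link_attach(), i.e. the
	 * path that would hit needs_recursion_check() in the patch above */
	l1 = bpf_program__attach_raw_tracepoint(raw_tp_prog, "contention_begin");

	/* perf_event/kprobe path -- not covered by the tracepoint-level
	 * norecurse wrappers */
	l2 = bpf_program__attach_kprobe(kprobe_prog, false /* !retprobe */,
					"queued_spin_lock_slowpath");

	return (libbpf_get_error(l1) || libbpf_get_error(l2)) ? -1 : 0;
}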