On Sat, Dec 03, 2022 at 09:58:34AM -0800, Namhyung Kim wrote:
> On Wed, Nov 30, 2022 at 03:29:39PM -0800, Andrii Nakryiko wrote:
> > On Fri, Nov 25, 2022 at 1:35 AM Jiri Olsa <olsajiri@xxxxxxxxx> wrote:
> > >
> > > On Thu, Nov 24, 2022 at 09:17:22AM -0800, Alexei Starovoitov wrote:
> > > > On Thu, Nov 24, 2022 at 1:42 AM Jiri Olsa <olsajiri@xxxxxxxxx> wrote:
> > > > >
> > > > > On Thu, Nov 24, 2022 at 01:41:23AM +0100, Daniel Borkmann wrote:
> > > > > > On 11/21/22 10:31 PM, Jiri Olsa wrote:
> > > > > > > We hit the following issues [1] [2] when we attach a bpf program
> > > > > > > that calls the bpf_trace_printk helper to the contention_begin
> > > > > > > tracepoint.
> > > > > > >
> > > > > > > As described in [3], multiple bpf programs that call the
> > > > > > > bpf_trace_printk helper attached to contention_begin might exhaust
> > > > > > > the printk buffer or cause a deadlock [2].
> > > > > > >
> > > > > > > There's also another possible deadlock when multiple bpf programs
> > > > > > > attach to the bpf_trace_printk tracepoint and call one of the
> > > > > > > printk bpf helpers.
> > > > > > >
> > > > > > > This change denies attachment of a bpf program to the
> > > > > > > contention_begin and bpf_trace_printk tracepoints if the program
> > > > > > > calls one of the printk bpf helpers.
> > > > > > >
> > > > > > > Also adding a verifier check for tp_btf programs, so this can be
> > > > > > > caught at program load time with an error message like:
> > > > > > >
> > > > > > >   Can't attach program with bpf_trace_printk#6 helper to contention_begin tracepoint.
> > > > > > >
> > > > > > > [1] https://lore.kernel.org/bpf/CACkBjsakT_yWxnSWr4r-0TpPvbKm9-OBmVUhJb7hV3hY8fdCkw@xxxxxxxxxxxxxx/
> > > > > > > [2] https://lore.kernel.org/bpf/CACkBjsaCsTovQHFfkqJKto6S4Z8d02ud1D7MPESrHa1cVNNTrw@xxxxxxxxxxxxxx/
> > > > > > > [3] https://lore.kernel.org/bpf/Y2j6ivTwFmA0FtvY@krava/
> > > > > > >
> > > > > > > Reported-by: Hao Sun <sunhao.th@xxxxxxxxx>
> > > > > > > Suggested-by: Alexei Starovoitov <ast@xxxxxxxxxx>
> > > > > > > Signed-off-by: Jiri Olsa <jolsa@xxxxxxxxxx>
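(For illustration only, a program along the lines of the untested sketch
below is what the check rejects; the section/function names are made up,
and bpf_printk() is the usual libbpf macro that expands to the
bpf_trace_printk helper.)

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* calls a printk helper from contention_begin, so loading this as
 * tp_btf/contention_begin would now fail with the error above
 */
SEC("tp_btf/contention_begin")
int BPF_PROG(on_contention, void *lock, unsigned int flags)
{
	bpf_printk("contended lock %p flags %u", lock, flags);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";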
> > > > > > > ---
> > > > > > >  include/linux/bpf.h          |  1 +
> > > > > > >  include/linux/bpf_verifier.h |  2 ++
> > > > > > >  kernel/bpf/syscall.c         |  3 +++
> > > > > > >  kernel/bpf/verifier.c        | 46 ++++++++++++++++++++++++++++++++++++
> > > > > > >  4 files changed, 52 insertions(+)
> > > > > > >
> > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > > index c9eafa67f2a2..3ccabede0f50 100644
> > > > > > > --- a/include/linux/bpf.h
> > > > > > > +++ b/include/linux/bpf.h
> > > > > > > @@ -1319,6 +1319,7 @@ struct bpf_prog {
> > > > > > >  				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
> > > > > > >  				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
> > > > > > >  				call_get_func_ip:1, /* Do we call get_func_ip() */
> > > > > > > +				call_printk:1, /* Do we call trace_printk/trace_vprintk */
> > > > > > >  				tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
> > > > > > >  	enum bpf_prog_type type; /* Type of BPF program */
> > > > > > >  	enum bpf_attach_type expected_attach_type; /* For some prog types */
> > > > > > > diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> > > > > > > index 545152ac136c..7118c2fda59d 100644
> > > > > > > --- a/include/linux/bpf_verifier.h
> > > > > > > +++ b/include/linux/bpf_verifier.h
> > > > > > > @@ -618,6 +618,8 @@ bool is_dynptr_type_expected(struct bpf_verifier_env *env,
> > > > > > >  			      struct bpf_reg_state *reg,
> > > > > > >  			      enum bpf_arg_type arg_type);
> > > > > > >
> > > > > > > +int bpf_check_tp_printk_denylist(const char *name, struct bpf_prog *prog);
> > > > > > > +
> > > > > > >  /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
> > > > > > >  static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
> > > > > > >  					     struct btf *btf, u32 btf_id)
> > > > > > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > > > > > > index 35972afb6850..9a69bda7d62b 100644
> > > > > > > --- a/kernel/bpf/syscall.c
> > > > > > > +++ b/kernel/bpf/syscall.c
> > > > > > > @@ -3329,6 +3329,9 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
> > > > > > >  		return -EINVAL;
> > > > > > >  	}
> > > > > > >
> > > > > > > +	if (bpf_check_tp_printk_denylist(tp_name, prog))
> > > > > > > +		return -EACCES;
> > > > > > > +
> > > > > > >  	btp = bpf_get_raw_tracepoint(tp_name);
> > > > > > >  	if (!btp)
> > > > > > >  		return -ENOENT;
> > > > > > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > > > > > index f07bec227fef..b662bc851e1c 100644
> > > > > > > --- a/kernel/bpf/verifier.c
> > > > > > > +++ b/kernel/bpf/verifier.c
> > > > > > > @@ -7472,6 +7472,47 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno
> > > > > > >  		  state->callback_subprogno == subprogno);
> > > > > > >  }
> > > > > > >
> > > > > > > +int bpf_check_tp_printk_denylist(const char *name, struct bpf_prog *prog)
> > > > > > > +{
> > > > > > > +	static const char * const denylist[] = {
> > > > > > > +		"contention_begin",
> > > > > > > +		"bpf_trace_printk",
> > > > > > > +	};
> > > > > > > +	int i;
> > > > > > > +
> > > > > > > +	/* Do not allow attachment to denylist[] tracepoints,
> > > > > > > +	 * if the program calls one of the printk helpers,
> > > > > > > +	 * because there's possibility of deadlock.
> > > > > > > +	 */
> > > > > >
> > > > > > What if that prog doesn't but tail calls into another one which calls printk helpers?
> > > > >
> > > > > right, I'll deny that for all BPF_PROG_TYPE_RAW_TRACEPOINT* programs,
> > > > > because I don't see an easy way to check for that
> > > > >
> > > > > we can leave the printk check for tracing BPF_TRACE_RAW_TP programs,
> > > > > because the verifier knows the exact tracepoint already
> > > >
> > > > This is all fragile and merely a stopgap.
> > > > Doesn't sound like the issue is limited to bpf_trace_printk
> > >
> > > hm, I don't have a better idea how to fix that.. I can't deny
> > > contention_begin completely, because we use it in perf via
> > > tp_btf/contention_begin (perf lock contention) and I don't
> > > think there's another way for perf to do that
> > >
> > > fwiw the last version below denies BPF_PROG_TYPE_RAW_TRACEPOINT
> > > programs completely, and tracing BPF_TRACE_RAW_TP programs with
> > > printks
> >
> > I think disabling the bpf_trace_printk() tracepoint for any BPF program
> > is totally fine. This tracepoint was never intended to be attached to.
> >
> > But as for the general bpf_trace_printk() deadlocking: should we
> > discuss how to make it not deadlock instead of starting to denylist
> > things left and right?
> >
> > Do I understand correctly that we take trace_printk_lock only to
> > protect that static char buf[]? Can we just make this buf per-CPU and
> > do a trylock instead? We'll only fail to bpf_trace_printk() something
> > if we have nested BPF programs (rare) or NMI (also rare).
> >
> > And it's a printk(), it's never mission-critical, so if we drop some
> > message in a rare case it's totally fine.
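fwiw, the per-CPU buffer + trylock idea might look roughly like the
untested sketch below; bpf_trace_printk_emit() is a made-up name standing
in for the tail of bpf_trace_printk() after bpf_bprintf_prepare() has
built bin_args, and a per-CPU nest counter stands in for the trylock:

#include <linux/percpu.h>

#define BPF_TRACE_PRINTK_SIZE 1024

struct trace_printk_buf {
	char data[BPF_TRACE_PRINTK_SIZE];
	int nest;
};
static DEFINE_PER_CPU(struct trace_printk_buf, printk_buf);

static int bpf_trace_printk_emit(const char *fmt, u32 *bin_args)
{
	struct trace_printk_buf *buf;
	int ret = 0;

	preempt_disable();
	buf = this_cpu_ptr(&printk_buf);
	/* "trylock": a nested caller on this CPU (bpf-in-bpf or NMI)
	 * just drops its message instead of deadlocking on the global
	 * trace_printk_lock
	 */
	if (buf->nest++)
		goto out;
	ret = bstr_printf(buf->data, sizeof(buf->data), fmt, bin_args);
	trace_bpf_trace_printk(buf->data);
out:
	buf->nest--;
	preempt_enable();
	return ret;
}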
>
> What about contention_begin?  I wonder if we can disallow recursion for
> those in the deny list, like using bpf_prog_active..

I was testing the change below, which allows checking recursion just for
the contention_begin tracepoint.

For the reported issue we might be ok with the change that Andrii
suggested, but we could have the change below as an extra precaution.

jirka

---
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 20749bd9db71..1c89d4292374 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -740,8 +740,8 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
-int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
-int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_event_data *data);
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_event_data *data);
 struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
 void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
@@ -873,31 +873,31 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_free_bpf_prog(struct perf_event *event);
-void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
-void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
-void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run1(struct bpf_raw_event_data *data, u64 arg1);
+void bpf_trace_run2(struct bpf_raw_event_data *data, u64 arg1, u64 arg2);
+void bpf_trace_run3(struct bpf_raw_event_data *data, u64 arg1, u64 arg2,
 		    u64 arg3);
-void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run4(struct bpf_raw_event_data *data, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4);
-void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run5(struct bpf_raw_event_data *data, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4, u64 arg5);
-void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run6(struct bpf_raw_event_data *data, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4, u64 arg5, u64 arg6);
-void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run7(struct bpf_raw_event_data *data, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4, u64 arg5, u64 arg6,
 		    u64 arg7);
-void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run8(struct bpf_raw_event_data *data, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4, u64 arg5, u64 arg6,
 		    u64 arg7, u64 arg8);
-void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run9(struct bpf_raw_event_data *data, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4, u64 arg5, u64 arg6,
 		    u64 arg7, u64 arg8, u64 arg9);
-void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run10(struct bpf_raw_event_data *data, u64 arg1, u64 arg2,
 		     u64 arg3, u64 arg4, u64 arg5, u64 arg6,
 		     u64 arg7, u64 arg8, u64 arg9, u64 arg10);
-void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run11(struct bpf_raw_event_data *data, u64 arg1, u64 arg2,
 		     u64 arg3, u64 arg4, u64 arg5, u64 arg6,
 		     u64 arg7, u64 arg8, u64 arg9, u64 arg10,
 		     u64 arg11);
-void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run12(struct bpf_raw_event_data *data, u64 arg1, u64 arg2,
 		     u64 arg3, u64 arg4, u64 arg5, u64 arg6,
 		     u64 arg7, u64 arg8, u64 arg9, u64 arg10,
 		     u64 arg11, u64 arg12);
 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index e7c2276be33e..5312a8b149c0 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -46,6 +46,11 @@ typedef const int tracepoint_ptr_t;
 typedef struct tracepoint * const tracepoint_ptr_t;
 #endif
 
+struct bpf_raw_event_data {
+	struct bpf_prog *prog;
+	int __percpu *recursion;
+};
+
 struct bpf_raw_event_map {
 	struct tracepoint *tp;
 	void *bpf_func;
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index 6a13220d2d27..a8f9c3c7c447 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -81,8 +81,8 @@
 static notrace void							\
 __bpf_trace_##call(void *__data, proto)					\
 {									\
-	struct bpf_prog *prog = __data;					\
-	CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args));	\
+	struct bpf_raw_event_data *____data = __data;			\
+	CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(____data, CAST_TO_U64(args)); \
 }
 
 #undef DECLARE_EVENT_CLASS
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35972afb6850..5dcb32cd24e6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3141,9 +3141,36 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	return err;
 }
 
+static bool needs_recursion_check(struct bpf_raw_event_map *btp)
+{
+	return !strcmp(btp->tp->name, "contention_begin");
+}
+
+static int bpf_raw_event_data_init(struct bpf_raw_event_data *data,
+				   struct bpf_raw_event_map *btp,
+				   struct bpf_prog *prog)
+{
+	int __percpu *recursion = NULL;
+
+	if (needs_recursion_check(btp)) {
+		recursion = alloc_percpu_gfp(int, GFP_KERNEL);
+		if (!recursion)
+			return -ENOMEM;
+	}
+	data->recursion = recursion;
+	data->prog = prog;
+	return 0;
+}
+
+static void bpf_raw_event_data_release(struct bpf_raw_event_data *data)
+{
+	free_percpu(data->recursion);
+}
+
 struct bpf_raw_tp_link {
 	struct bpf_link link;
 	struct bpf_raw_event_map *btp;
+	struct bpf_raw_event_data data;
 };
 
 static void bpf_raw_tp_link_release(struct bpf_link *link)
@@ -3151,7 +3178,8 @@ static void bpf_raw_tp_link_release(struct bpf_link *link)
 	struct bpf_raw_tp_link *raw_tp =
 		container_of(link, struct bpf_raw_tp_link, link);
 
-	bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
+	bpf_probe_unregister(raw_tp->btp, &raw_tp->data);
+	bpf_raw_event_data_release(&raw_tp->data);
 	bpf_put_raw_tracepoint(raw_tp->btp);
 }
 
@@ -3338,17 +3366,23 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
 		err = -ENOMEM;
 		goto out_put_btp;
 	}
+	if (bpf_raw_event_data_init(&link->data, btp, prog)) {
+		err = -ENOMEM;
+		kfree(link);
+		goto out_put_btp;
+	}
 	bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
 		      &bpf_raw_tp_link_lops, prog);
 	link->btp = btp;
 
 	err = bpf_link_prime(&link->link, &link_primer);
 	if (err) {
+		bpf_raw_event_data_release(&link->data);
 		kfree(link);
 		goto out_put_btp;
 	}
 
-	err = bpf_probe_register(link->btp, prog);
+	err = bpf_probe_register(link->btp, &link->data);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
 		goto out_put_btp;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3bbd3f0c810c..d27b7dc77894 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2252,9 +2252,8 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 }
 
 static __always_inline
-void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
+void __bpf_trace_prog_run(struct bpf_prog *prog, u64 *args)
 {
-	cant_sleep();
 	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
 		bpf_prog_inc_misses_counter(prog);
 		goto out;
@@ -2266,6 +2265,22 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
 	this_cpu_dec(*(prog->active));
 }
 
+static __always_inline
+void __bpf_trace_run(struct bpf_raw_event_data *data, u64 *args)
+{
+	struct bpf_prog *prog = data->prog;
+
+	cant_sleep();
+	if (unlikely(!data->recursion))
+		return __bpf_trace_prog_run(prog, args);
+
+	if (unlikely(this_cpu_inc_return(*(data->recursion)) != 1))
+		goto out;
+	__bpf_trace_prog_run(prog, args);
+out:
+	this_cpu_dec(*(data->recursion));
+}
+
 #define UNPACK(...)			__VA_ARGS__
 #define REPEAT_1(FN, DL, X, ...)	FN(X)
 #define REPEAT_2(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__)
@@ -2290,12 +2305,12 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
 #define __SEQ_0_11	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
 
 #define BPF_TRACE_DEFN_x(x)						\
-	void bpf_trace_run##x(struct bpf_prog *prog,			\
+	void bpf_trace_run##x(struct bpf_raw_event_data *data,		\
 			      REPEAT(x, SARG, __DL_COM, __SEQ_0_11))	\
 	{								\
 		u64 args[x];						\
 		REPEAT(x, COPY, __DL_SEM, __SEQ_0_11);			\
-		__bpf_trace_run(prog, args);				\
+		__bpf_trace_run(data, args);				\
 	}								\
 	EXPORT_SYMBOL_GPL(bpf_trace_run##x)
 BPF_TRACE_DEFN_x(1);
@@ -2311,8 +2326,9 @@ BPF_TRACE_DEFN_x(10);
 BPF_TRACE_DEFN_x(11);
 BPF_TRACE_DEFN_x(12);
 
-static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_event_data *data)
 {
+	struct bpf_prog *prog = data->prog;
 	struct tracepoint *tp = btp->tp;
 
 	/*
@@ -2326,17 +2342,17 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
 		return -EINVAL;
 
 	return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
-						   prog);
+						   data);
 }
 
-int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_event_data *data)
 {
-	return __bpf_probe_register(btp, prog);
+	return __bpf_probe_register(btp, data);
 }
 
-int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_event_data *data)
 {
-	return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
+	return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, data);
 }
 
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
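For clarity, the guard in __bpf_trace_run() above boils down to the
pattern in this hypothetical standalone example (not part of the patch):
the first entry on a CPU sees the counter go 0 -> 1 and runs the program,
while any nested entry on the same CPU bails out, which is what breaks
the tracepoint -> bpf program -> tracepoint cycle.

static DEFINE_PER_CPU(int, demo_recursion);

static void demo_tracepoint_handler(void)
{
	if (this_cpu_inc_return(demo_recursion) != 1)
		goto out;	/* re-entered on this CPU, skip the prog */

	/* run the bpf program here; anything it does that fires the
	 * same tracepoint again on this CPU lands in the branch above
	 */
out:
	this_cpu_dec(demo_recursion);
}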