On Wed, Dec 14, 2022 at 2:04 AM Jiri Olsa <jolsa@xxxxxxxxxx> wrote:
>
> Both the bpf_trace_printk and bpf_trace_vprintk helpers use a static
> buffer guarded by the trace_printk_lock spin lock.
>
> The spin lock contention causes issues with bpf programs attached to
> the contention_begin tracepoint [1] [2].
>
> Andrii suggested we could get rid of the contention by using trylock,
> but we could actually get rid of the spinlock completely by using
> per-cpu buffers, the same way as for bin_args in the
> bpf_bprintf_prepare function.
>
> Add 4 per-cpu buffers (1k each), which should be enough for all
> possible nesting contexts (normal, softirq, irq, nmi) or a possible
> (yet unlikely) probe within the printk helpers.
>
> In the very unlikely case we run out of nesting levels, the printk
> will be omitted.
>
> [1] https://lore.kernel.org/bpf/CACkBjsakT_yWxnSWr4r-0TpPvbKm9-OBmVUhJb7hV3hY8fdCkw@xxxxxxxxxxxxxx/
> [2] https://lore.kernel.org/bpf/CACkBjsaCsTovQHFfkqJKto6S4Z8d02ud1D7MPESrHa1cVNNTrw@xxxxxxxxxxxxxx/
>
> Reported-by: Hao Sun <sunhao.th@xxxxxxxxx>
> Suggested-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> Signed-off-by: Jiri Olsa <jolsa@xxxxxxxxxx>
> ---
> v2 changes:
> - changed subject [Yonghong]
> - added WARN_ON_ONCE to get_printk_buf [Song]
>
>  kernel/trace/bpf_trace.c | 61 +++++++++++++++++++++++++++++++---------
>  1 file changed, 47 insertions(+), 14 deletions(-)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 3bbd3f0c810c..a992b5a47fd6 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -369,33 +369,62 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
>  	return &bpf_probe_write_user_proto;
>  }
>
> -static DEFINE_RAW_SPINLOCK(trace_printk_lock);
> -
>  #define MAX_TRACE_PRINTK_VARARGS	3
>  #define BPF_TRACE_PRINTK_SIZE		1024
> +#define BPF_TRACE_PRINTK_LEVELS		4
> +
> +struct trace_printk_buf {
> +	char data[BPF_TRACE_PRINTK_LEVELS][BPF_TRACE_PRINTK_SIZE];
> +	int level;
> +};
> +static DEFINE_PER_CPU(struct trace_printk_buf, printk_buf);
> +
> +static void put_printk_buf(struct trace_printk_buf __percpu *buf)
> +{
> +	if (WARN_ON_ONCE(this_cpu_read(buf->level) == 0))
> +		return;
> +	this_cpu_dec(buf->level);
> +	preempt_enable();
> +}
> +
> +static bool get_printk_buf(struct trace_printk_buf __percpu *buf, char **data)
> +{
> +	int level;
> +
> +	preempt_disable();
> +	level = this_cpu_inc_return(buf->level);
> +	if (WARN_ON_ONCE(level > BPF_TRACE_PRINTK_LEVELS)) {
> +		put_printk_buf(buf);
> +		return false;
> +	}
> +	*data = (char *) this_cpu_ptr(&buf->data[level - 1]);
> +	return true;
> +}
>
>  BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
>  	   u64, arg2, u64, arg3)
>  {
>  	u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
>  	u32 *bin_args;
> -	static char buf[BPF_TRACE_PRINTK_SIZE];
> -	unsigned long flags;
> +	char *buf;
>  	int ret;
>
> +	if (!get_printk_buf(&printk_buf, &buf))
> +		return -EBUSY;
> +
>  	ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args,
>  				  MAX_TRACE_PRINTK_VARARGS);
>  	if (ret < 0)
> -		return ret;
> +		goto out;
>
> -	raw_spin_lock_irqsave(&trace_printk_lock, flags);
> -	ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
> +	ret = bstr_printf(buf, BPF_TRACE_PRINTK_SIZE, fmt, bin_args);
>
>  	trace_bpf_trace_printk(buf);
> -	raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
>
>  	bpf_bprintf_cleanup();
>
> +out:
> +	put_printk_buf(&printk_buf);
>  	return ret;
>  }
>
> @@ -427,31 +456,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
>  	return &bpf_trace_printk_proto;
>  }
>
> +static DEFINE_PER_CPU(struct trace_printk_buf, vprintk_buf);
> +
>  BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, data,
>  	   u32, data_len)
>  {
> -	static char buf[BPF_TRACE_PRINTK_SIZE];
> -	unsigned long flags;
>  	int ret, num_args;
>  	u32 *bin_args;
> +	char *buf;
>
>  	if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
>  	    (data_len && !data))
>  		return -EINVAL;
>  	num_args = data_len / 8;
>
> +	if (!get_printk_buf(&vprintk_buf, &buf))
> +		return -EBUSY;
> +
>  	ret = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
>  	if (ret < 0)
> -		return ret;
> +		goto out;
>
> -	raw_spin_lock_irqsave(&trace_printk_lock, flags);
> -	ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
> +	ret = bstr_printf(buf, BPF_TRACE_PRINTK_SIZE, fmt, bin_args);
>
>  	trace_bpf_trace_printk(buf);
> -	raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
>
>  	bpf_bprintf_cleanup();

I checked what bpf_bprintf_prepare() is doing, and it seems like we
already have a very similar thing there, with preempt_disable(), 3-level
buffers, etc. Would it make sense to roll this new change into
bpf_bprintf_prepare() and make it return the buffer for
bpf_trace_printk(), even if it is not used for bpf_snprintf()? A rough
sketch of what I mean is at the bottom.

>
> +out:
> +	put_printk_buf(&vprintk_buf);
>  	return ret;
>  }
>
> --
> 2.38.1
>
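To make the suggestion concrete, here is a rough, untested sketch of the
direction I have in mind. Note that struct bpf_bprintf_data, the
get_bin_args/get_buf flags and MAX_BPRINTF_BUF are made-up names for
illustration, not existing API; bufs and nest_level stand for the
per-cpu buffers and nesting counter that bpf_bprintf_prepare() already
uses for bin_args:

	/* hypothetical: callers describe what they need, and prepare
	 * hands out regions of the same per-cpu, preempt-disabled
	 * nesting slot
	 */
	struct bpf_bprintf_data {
		u32 *bin_args;
		char *buf;
		bool get_bin_args;
		bool get_buf;
	};

	/* inside bpf_bprintf_prepare(), after the existing nest_level
	 * logic has picked a slot in the per-cpu bufs:
	 */
	if (data->get_bin_args)
		data->bin_args = bufs->bin_args;
	if (data->get_buf)
		data->buf = bufs->buf;

	/* bpf_trace_printk() would then shrink to roughly: */
	u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
	struct bpf_bprintf_data data = {
		.get_bin_args = true,
		.get_buf = true,
	};
	int ret;

	ret = bpf_bprintf_prepare(fmt, fmt_size, args,
				  MAX_TRACE_PRINTK_VARARGS, &data);
	if (ret < 0)
		return ret;

	ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args);
	trace_bpf_trace_printk(data.buf);

	bpf_bprintf_cleanup(&data);
	return ret;

That would avoid having a second set of per-cpu buffers and a second
nesting counter just for the trace_printk helpers.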