On Thu, 1 Nov 2018 19:35:50 +1100 Aleksa Sarai <cyphar@xxxxxxxxxx> wrote: > Historically, kretprobe has always produced unusable stack traces > (kretprobe_trampoline is the only entry in most cases, because of the > funky stack pointer overwriting). This has caused quite a few annoyances > when using tracing to debug problems[1] -- since return values are only > available with kretprobes but stack traces were only usable for kprobes, > users had to probe both and then manually associate them. > > With the advent of bpf_trace, users would have been able to do this > association in bpf, but this was less than ideal (because > bpf_get_stackid would still produce rubbish and programs that didn't > know better would get silly results). The main usecase for stack traces > (at least with bpf_trace) is for DTrace-style aggregation on stack > traces (both entry and exit). Therefore we cannot simply correct the > stack trace on exit -- we must stash away the stack trace and return the > entry stack trace when it is requested. > > [1]: https://github.com/iovisor/bpftrace/issues/101 > > Cc: Brendan Gregg <bgregg@xxxxxxxxxxx> > Cc: Christian Brauner <christian@xxxxxxxxxx> > Signed-off-by: Aleksa Sarai <cyphar@xxxxxxxxxx> > --- > Documentation/kprobes.txt | 6 +- > include/linux/kprobes.h | 27 +++++ > kernel/events/callchain.c | 8 +- > kernel/kprobes.c | 101 +++++++++++++++++- > kernel/trace/trace.c | 11 +- > .../test.d/kprobe/kretprobe_stacktrace.tc | 25 +++++ > 6 files changed, 173 insertions(+), 5 deletions(-) > create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc > > diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt > index 10f4499e677c..1965585848f4 100644 > --- a/Documentation/kprobes.txt > +++ b/Documentation/kprobes.txt > @@ -597,7 +597,11 @@ address with the trampoline's address, stack backtraces and calls > to __builtin_return_address() will typically yield the trampoline's > address instead of the real return address for kretprobed functions. > (As far as we can tell, __builtin_return_address() is used only > -for instrumentation and error reporting.) > +for instrumentation and error reporting.) However, since return probes > +are used extensively in tracing (where stack backtraces are useful), > +return probes will stash away the stack backtrace during function entry > +so that return probe handlers can use the entry backtrace instead of > +having a trace with just kretprobe_trampoline. > > If the number of times a function is called does not match the number > of times it returns, registering a return probe on that function may > diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h > index e909413e4e38..1a1629544e56 100644 > --- a/include/linux/kprobes.h > +++ b/include/linux/kprobes.h > @@ -40,6 +40,8 @@ > #include <linux/rcupdate.h> > #include <linux/mutex.h> > #include <linux/ftrace.h> > +#include <linux/stacktrace.h> > +#include <linux/perf_event.h> > #include <asm/kprobes.h> > > #ifdef CONFIG_KPROBES > @@ -168,11 +170,18 @@ struct kretprobe { > raw_spinlock_t lock; > }; > > +#define KRETPROBE_TRACE_SIZE 127 > +struct kretprobe_trace { > + int nr_entries; > + unsigned long entries[KRETPROBE_TRACE_SIZE]; > +}; > + > struct kretprobe_instance { > struct hlist_node hlist; > struct kretprobe *rp; > kprobe_opcode_t *ret_addr; > struct task_struct *task; > + struct kretprobe_trace entry; > char data[0]; > }; > > @@ -371,6 +380,12 @@ void unregister_kretprobe(struct kretprobe *rp); > int register_kretprobes(struct kretprobe **rps, int num); > void unregister_kretprobes(struct kretprobe **rps, int num); > > +struct kretprobe_instance *current_kretprobe_instance(void); > +void kretprobe_save_stack_trace(struct kretprobe_instance *ri, > + struct stack_trace *trace); > +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri, > + struct perf_callchain_entry_ctx *ctx); > + > void kprobe_flush_task(struct task_struct *tk); > void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); > > @@ -397,6 +412,18 @@ static inline struct kprobe *kprobe_running(void) > { > return NULL; > } > +static inline struct kretprobe_instance *current_kretprobe_instance(void) > +{ > + return NULL; > +} > +static inline void kretprobe_save_stack_trace(struct kretprobe_instance *ri, > + struct stack_trace *trace) > +{ > +} > +static inline void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri, > + struct perf_callchain_entry_ctx *ctx) > +{ > +} > static inline int register_kprobe(struct kprobe *p) > { > return -ENOSYS; > diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c > index 24a77c34e9ad..98edcd8a6987 100644 > --- a/kernel/events/callchain.c > +++ b/kernel/events/callchain.c > @@ -12,6 +12,7 @@ > #include <linux/perf_event.h> > #include <linux/slab.h> > #include <linux/sched/task_stack.h> > +#include <linux/kprobes.h> > > #include "internal.h" > > @@ -197,9 +198,14 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, > ctx.contexts_maxed = false; > > if (kernel && !user_mode(regs)) { > + struct kretprobe_instance *ri = current_kretprobe_instance(); > + > if (add_mark) > perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); > - perf_callchain_kernel(&ctx, regs); > + if (ri) > + kretprobe_perf_callchain_kernel(ri, &ctx); > + else > + perf_callchain_kernel(&ctx, regs); > } > > if (user) { > diff --git a/kernel/kprobes.c b/kernel/kprobes.c > index 90e98e233647..fca3964d18cd 100644 > --- a/kernel/kprobes.c > +++ b/kernel/kprobes.c > @@ -1206,6 +1206,16 @@ __releases(hlist_lock) > } > NOKPROBE_SYMBOL(kretprobe_table_unlock); > > +static bool kretprobe_hash_is_locked(struct task_struct *tsk) > +{ > + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); > + raw_spinlock_t *hlist_lock; > + > + hlist_lock = kretprobe_table_lock_ptr(hash); > + return raw_spin_is_locked(hlist_lock); > +} > +NOKPROBE_SYMBOL(kretprobe_hash_is_locked); > + > /* > * This function is called from finish_task_switch when task tk becomes dead, > * so that we can recycle any function-return probe instances associated > @@ -1800,6 +1810,13 @@ unsigned long __weak arch_deref_entry_point(void *entry) > return (unsigned long)entry; > } > > +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs); > + > +static inline bool kprobe_is_retprobe(struct kprobe *kp) > +{ > + return kp->pre_handler == pre_handler_kretprobe; > +} > + > #ifdef CONFIG_KRETPROBES > /* > * This kprobe pre_handler is registered with every kretprobe. When probe > @@ -1826,6 +1843,8 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) > hash = hash_ptr(current, KPROBE_HASH_BITS); > raw_spin_lock_irqsave(&rp->lock, flags); > if (!hlist_empty(&rp->free_instances)) { > + struct stack_trace trace = {}; > + > ri = hlist_entry(rp->free_instances.first, > struct kretprobe_instance, hlist); > hlist_del(&ri->hlist); > @@ -1834,6 +1853,11 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) > ri->rp = rp; > ri->task = current; > > + trace.entries = &ri->entry.entries[0]; > + trace.max_entries = KRETPROBE_TRACE_SIZE; > + save_stack_trace_regs(regs, &trace); > + ri->entry.nr_entries = trace.nr_entries; > + So basically your saving a copy of the stack trace for each probe, for something that may not ever be used? This adds quite a lot of overhead for something not used often. I think the real answer is to fix kretprobes (and I just checked, the return call of function graph tracer stack trace doesn't go past the return_to_handler function either. It's just that a stack trace was never done on the return side). The more I look at this patch, the less I like it. -- Steve > if (rp->entry_handler && rp->entry_handler(ri, regs)) { > raw_spin_lock_irqsave(&rp->lock, flags); > hlist_add_head(&ri->hlist, &rp->free_instances); > @@ -1856,6 +1880,65 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) > } > NOKPROBE_SYMBOL(pre_handler_kretprobe); > > +/* > + * Return the kretprobe_instance associated with the current_kprobe. Calling > + * this is only reasonable from within a kretprobe handler context (otherwise > + * return NULL). > + * > + * Must be called within a kretprobe_hash_lock(current, ...) context. > + */ > +struct kretprobe_instance *current_kretprobe_instance(void) > +{ > + struct kprobe *kp; > + struct kretprobe *rp; > + struct kretprobe_instance *ri; > + struct hlist_head *head; > + unsigned long hash = hash_ptr(current, KPROBE_HASH_BITS); > + > + kp = kprobe_running(); > + if (!kp || !kprobe_is_retprobe(kp)) > + return NULL; > + if (WARN_ON(!kretprobe_hash_is_locked(current))) > + return NULL; > + > + rp = container_of(kp, struct kretprobe, kp); > + head = &kretprobe_inst_table[hash]; > + > + hlist_for_each_entry(ri, head, hlist) { > + if (ri->task == current && ri->rp == rp) > + return ri; > + } > + return NULL; > +} > +EXPORT_SYMBOL_GPL(current_kretprobe_instance); > +NOKPROBE_SYMBOL(current_kretprobe_instance); > + > +void kretprobe_save_stack_trace(struct kretprobe_instance *ri, > + struct stack_trace *trace) > +{ > + int i; > + struct kretprobe_trace *krt = &ri->entry; > + > + for (i = trace->skip; i < krt->nr_entries; i++) { > + if (trace->nr_entries >= trace->max_entries) > + break; > + trace->entries[trace->nr_entries++] = krt->entries[i]; > + } > +} > + > +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri, > + struct perf_callchain_entry_ctx *ctx) > +{ > + int i; > + struct kretprobe_trace *krt = &ri->entry; > + > + for (i = 0; i < krt->nr_entries; i++) { > + if (krt->entries[i] == ULONG_MAX) > + break; > + perf_callchain_store(ctx, (u64) krt->entries[i]); > + } > +} > + > bool __weak arch_kprobe_on_func_entry(unsigned long offset) > { > return !offset; > @@ -2005,6 +2088,22 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) > } > NOKPROBE_SYMBOL(pre_handler_kretprobe); > > +struct kretprobe_instance *current_kretprobe_instance(void) > +{ > + return NULL; > +} > +EXPORT_SYMBOL_GPL(current_kretprobe_instance); > +NOKPROBE_SYMBOL(current_kretprobe_instance); > + > +void kretprobe_save_stack_trace(struct kretprobe_instance *ri, > + struct stack_trace *trace) > +{ > +} > + > +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri, > + struct perf_callchain_entry_ctx *ctx) > +{ > +} > #endif /* CONFIG_KRETPROBES */ > > /* Set the kprobe gone and remove its instruction buffer. */ > @@ -2241,7 +2340,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, > char *kprobe_type; > void *addr = p->addr; > > - if (p->pre_handler == pre_handler_kretprobe) > + if (kprobe_is_retprobe(p)) > kprobe_type = "r"; > else > kprobe_type = "k"; > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c > index bf6f1d70484d..2210d38a4dbf 100644 > --- a/kernel/trace/trace.c > +++ b/kernel/trace/trace.c > @@ -42,6 +42,7 @@ > #include <linux/nmi.h> > #include <linux/fs.h> > #include <linux/trace.h> > +#include <linux/kprobes.h> > #include <linux/sched/clock.h> > #include <linux/sched/rt.h> > > @@ -2590,6 +2591,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, > struct ring_buffer_event *event; > struct stack_entry *entry; > struct stack_trace trace; > + struct kretprobe_instance *ri = current_kretprobe_instance(); > int use_stack; > int size = FTRACE_STACK_ENTRIES; > > @@ -2626,7 +2628,9 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, > trace.entries = this_cpu_ptr(ftrace_stack.calls); > trace.max_entries = FTRACE_STACK_MAX_ENTRIES; > > - if (regs) > + if (ri) > + kretprobe_save_stack_trace(ri, &trace); > + else if (regs) > save_stack_trace_regs(regs, &trace); > else > save_stack_trace(&trace); > @@ -2653,7 +2657,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, > else { > trace.max_entries = FTRACE_STACK_ENTRIES; > trace.entries = entry->caller; > - if (regs) > + > + if (ri) > + kretprobe_save_stack_trace(ri, &trace); > + else if (regs) > save_stack_trace_regs(regs, &trace); > else > save_stack_trace(&trace); > diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc > new file mode 100644 > index 000000000000..03146c6a1a3c > --- /dev/null > +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc > @@ -0,0 +1,25 @@ > +#!/bin/sh > +# SPDX-License-Identifier: GPL-2.0+ > +# description: Kretprobe dynamic event with a stacktrace > + > +[ -f kprobe_events ] || exit_unsupported # this is configurable > + > +echo 0 > events/enable > +echo 1 > options/stacktrace > + > +echo 'r:teststackprobe sched_fork $retval' > kprobe_events > +grep teststackprobe kprobe_events > +test -d events/kprobes/teststackprobe > + > +clear_trace > +echo 1 > events/kprobes/teststackprobe/enable > +( echo "forked") > +echo 0 > events/kprobes/teststackprobe/enable > + > +# Make sure we don't see kretprobe_trampoline and we see _do_fork. > +! grep 'kretprobe' trace > +grep '_do_fork' trace > + > +echo '-:teststackprobe' >> kprobe_events > +clear_trace > +test -d events/kprobes/teststackprobe && exit_fail || exit_pass