On Thu, Oct 22, 2020 at 04:52:29PM -0400, Steven Rostedt wrote:
> On Thu, 22 Oct 2020 12:21:50 -0400
> Steven Rostedt <rostedt@xxxxxxxxxxx> wrote:
> 
> > On Thu, 22 Oct 2020 10:42:05 -0400
> > Steven Rostedt <rostedt@xxxxxxxxxxx> wrote:
> > 
> > > I'd like to see how batch functions will work. I guess I need to start
> > > looking at the bpf trampoline, to see if we can modify the ftrace
> > > trampoline to have a quick access to parameters. It would be much more
> > > beneficial to update the existing generic function tracer to have access
> > > to function parameters that all users could benefit from, than to tweak a
> > > single use case into giving this feature to a single user.
> > 
> > Looking at the creation of the bpf trampoline, I think I can modify ftrace
> > to have a more flexible callback. Something that passes the callback the
> > following:
> > 
> >   the function being traced.
> >   a pointer to the parent caller (that could be modified)
> >   a pointer to the original stack frame (what the stack was when the
> >     function is entered)
> >   An array of the arguments of the function (which could also be modified)
> > 
> > This is a change I've been wanting to make for some time, because it would
> > allow function graph to be a user of function tracer, and would give
> > everything access to the arguments.
> > 
> > We would still need a helper function to store all regs to keep kprobes
> > working unmodified, but this would still only be done if asked.
> > 
> > The above change shouldn't hurt performance for either ftrace or bpf
> > because it appears they both do the same. If BPF wants to have a batch
> > processing of functions, then I think we should modify ftrace to do this
> > new approach instead of creating another set of function trampolines.
> 
> The below is a quick proof of concept patch I whipped up. It will always
> save 6 arguments, so if BPF is really interested in just saving the bare
> minimum of arguments before calling, it can still use direct. But if you
> are going to have a generic callback, you'll need to save all parameters
> otherwise you can corrupt the function's parameter if traced function uses
> more than you save.

nice, I'll take a look, thanks for quick code ;-)

> 
> Which looking at the bpf trampoline code, I noticed that if the verifier
> can't find the btf func, it falls back to saving 5 parameters. Which can be
> a bug on x86 if the function itself uses 6 or more. If you only save 5
> parameters, then call a bpf program that calls a helper function that uses
> more than 5 parameters, it will likely corrupt the 6th parameter of the
> function being traced.

number of args from eBPF program to in-kernel function is restricted to 5,
so we should be fine

> 
> The code in question is this:
> 
> int btf_distill_func_proto(struct bpf_verifier_log *log,
>                            struct btf *btf,
>                            const struct btf_type *func,
>                            const char *tname,
>                            struct btf_func_model *m)
> {
>         const struct btf_param *args;
>         const struct btf_type *t;
>         u32 i, nargs;
>         int ret;
> 
>         if (!func) {
>                 /* BTF function prototype doesn't match the verifier types.
>                  * Fall back to 5 u64 args.
>                  */
>                 for (i = 0; i < 5; i++)
>                         m->arg_size[i] = 8;
>                 m->ret_size = 8;
>                 m->nr_args = 5;
>                 return 0;
>         }
> 
> Shouldn't it be falling back to 6, not 5?
but it looks like this actually could fall back to 6, the JIT would allow
that, but I'm not sure if there's another restriction

thanks,
jirka

> 
> -- Steve
> 
> diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
> index 7edbd5ee5ed4..b65d73f430ed 100644
> --- a/arch/x86/kernel/ftrace.c
> +++ b/arch/x86/kernel/ftrace.c
> @@ -287,6 +287,10 @@ extern void ftrace_caller_end(void);
>  extern void ftrace_caller_op_ptr(void);
>  extern void ftrace_regs_caller_op_ptr(void);
>  extern void ftrace_regs_caller_jmp(void);
> +extern void ftrace_args_caller(void);
> +extern void ftrace_args_call(void);
> +extern void ftrace_args_caller_end(void);
> +extern void ftrace_args_caller_op_ptr(void);
>  
>  /* movq function_trace_op(%rip), %rdx */
>  /* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */
> @@ -317,7 +321,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
>  	unsigned long end_offset;
>  	unsigned long op_offset;
>  	unsigned long call_offset;
> -	unsigned long jmp_offset;
> +	unsigned long jmp_offset = 0;
>  	unsigned long offset;
>  	unsigned long npages;
>  	unsigned long size;
> @@ -336,12 +340,16 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
>  		op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
>  		call_offset = (unsigned long)ftrace_regs_call;
>  		jmp_offset = (unsigned long)ftrace_regs_caller_jmp;
> +	} else if (ops->flags & FTRACE_OPS_FL_ARGS) {
> +		start_offset = (unsigned long)ftrace_args_caller;
> +		end_offset = (unsigned long)ftrace_args_caller_end;
> +		op_offset = (unsigned long)ftrace_args_caller_op_ptr;
> +		call_offset = (unsigned long)ftrace_args_call;
>  	} else {
>  		start_offset = (unsigned long)ftrace_caller;
>  		end_offset = (unsigned long)ftrace_caller_end;
>  		op_offset = (unsigned long)ftrace_caller_op_ptr;
>  		call_offset = (unsigned long)ftrace_call;
> -		jmp_offset = 0;
>  	}
>  
>  	size = end_offset - start_offset;
> diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
> index ac3d5f22fe64..65ca634d0b37 100644
> --- a/arch/x86/kernel/ftrace_64.S
> +++ b/arch/x86/kernel/ftrace_64.S
> @@ -176,6 +176,58 @@ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
>  	retq
>  SYM_FUNC_END(ftrace_epilogue)
>  
> +SYM_FUNC_START(ftrace_args_caller)
> +#ifdef CONFIG_FRAME_POINTER
> +	push %rbp
> +	movq %rsp, %rbp
> +# define CALLED_OFFSET (7 * 8)
> +# define PARENT_OFFSET (8 * 8)
> +#else
> +# define CALLED_OFFSET (6 * 8)
> +# define PARENT_OFFSET (7 * 8)
> +#endif
> +	/* save args */
> +	pushq %r9
> +	pushq %r8
> +	pushq %rcx
> +	pushq %rdx
> +	pushq %rsi
> +	pushq %rdi
> +
> +	/*
> +	 * Parameters:
> +	 *   Called site (function called)
> +	 *   Address of parent location
> +	 *   pointer to ftrace_ops
> +	 *   Location of stack when function was called
> +	 *   Array of arguments.
> +	 */
> +	movq CALLED_OFFSET(%rsp), %rdi
> +	leaq PARENT_OFFSET(%rsp), %rsi
> +SYM_INNER_LABEL(ftrace_args_caller_op_ptr, SYM_L_GLOBAL)
> +	/* Load the ftrace_ops into the 3rd parameter */
> +	movq function_trace_op(%rip), %rdx
> +	movq %rsi, %rcx
> +	leaq 0(%rsp), %r8
> +
> +SYM_INNER_LABEL(ftrace_args_call, SYM_L_GLOBAL)
> +	callq ftrace_stub
> +
> +	popq %rdi
> +	popq %rsi
> +	popq %rdx
> +	popq %rcx
> +	popq %r8
> +	popq %r9
> +
> +#ifdef CONFIG_FRAME_POINTER
> +	popq %rbp
> +#endif
> +
> +SYM_INNER_LABEL(ftrace_args_caller_end, SYM_L_GLOBAL)
> +	jmp ftrace_epilogue
> +SYM_FUNC_END(ftrace_args_caller)
> +
>  SYM_FUNC_START(ftrace_regs_caller)
>  	/* Save the current flags before any operations that can change them */
>  	pushfq
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index 1bd3a0356ae4..0d077e8d7bb4 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -92,6 +92,17 @@ struct ftrace_ops;
>  typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
>  			      struct ftrace_ops *op, struct pt_regs *regs);
>  
> +typedef void (*ftrace_args_func_t)(unsigned long ip, unsigned long *parent_ip,
> +				   struct ftrace_ops *op, unsigned long *stack,
> +				   unsigned long *args);
> +
> +union ftrace_callback {
> +	ftrace_func_t		func;
> +	ftrace_args_func_t	args_func;
> +};
> +
> +typedef union ftrace_callback ftrace_callback_t;
> +
>  ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
>  
>  /*
> @@ -169,6 +180,7 @@ enum {
>  	FTRACE_OPS_FL_TRACE_ARRAY	= BIT(15),
>  	FTRACE_OPS_FL_PERMANENT		= BIT(16),
>  	FTRACE_OPS_FL_DIRECT		= BIT(17),
> +	FTRACE_OPS_FL_ARGS		= BIT(18),
>  };
>  
>  #ifdef CONFIG_DYNAMIC_FTRACE
> @@ -447,9 +459,11 @@ enum {
>  	FTRACE_FL_DISABLED	= (1UL << 25),
>  	FTRACE_FL_DIRECT	= (1UL << 24),
>  	FTRACE_FL_DIRECT_EN	= (1UL << 23),
> +	FTRACE_FL_ARGS		= (1UL << 22),
> +	FTRACE_FL_ARGS_EN	= (1UL << 21),
>  };
>  
> -#define FTRACE_REF_MAX_SHIFT	23
> +#define FTRACE_REF_MAX_SHIFT	21
>  #define FTRACE_REF_MAX		((1UL << FTRACE_REF_MAX_SHIFT) - 1)
>  
>  #define ftrace_rec_count(rec)	((rec)->flags & FTRACE_REF_MAX)
> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index 4833b6a82ce7..5632b0809dc0 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -1721,6 +1721,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
>  		if (ops->flags & FTRACE_OPS_FL_DIRECT)
>  			rec->flags |= FTRACE_FL_DIRECT;
>  
> +		else if (ops->flags & FTRACE_OPS_FL_ARGS)
> +			rec->flags |= FTRACE_FL_ARGS;
> +
>  		/*
>  		 * If there's only a single callback registered to a
>  		 * function, and the ops has a trampoline registered
> @@ -1757,6 +1760,10 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
>  		if (ops->flags & FTRACE_OPS_FL_DIRECT)
>  			rec->flags &= ~FTRACE_FL_DIRECT;
>  
> +		/* POC: but we will have more than one */
> +		if (ops->flags & FTRACE_OPS_FL_ARGS)
> +			rec->flags &= ~FTRACE_FL_ARGS;
> +
>  		/*
>  		 * If the rec had REGS enabled and the ops that is
>  		 * being removed had REGS set, then see if there is
> @@ -2103,6 +2110,13 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
>  		    !(rec->flags & FTRACE_FL_TRAMP_EN))
>  			flag |= FTRACE_FL_TRAMP;
>  
> +		/* Proof of concept */
> +		if (ftrace_rec_count(rec) == 1) {
> +			if (!(rec->flags & FTRACE_FL_ARGS) !=
> +			    !(rec->flags & FTRACE_FL_ARGS_EN))
> +				flag |= FTRACE_FL_ARGS;
> +		}
> +
>  		/*
>  		 * Direct calls are special, as count matters.
>  		 * We must test the record for direct, if the
> @@ -2144,6 +2158,17 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
>  			else
>  				rec->flags &= ~FTRACE_FL_TRAMP_EN;
>  		}
> +		if (flag & FTRACE_FL_ARGS) {
> +			if (ftrace_rec_count(rec) == 1) {
> +				if (rec->flags & FTRACE_FL_ARGS)
> +					rec->flags |= FTRACE_FL_ARGS_EN;
> +				else
> +					rec->flags &= ~FTRACE_FL_ARGS_EN;
> +			} else {
> +				rec->flags &= ~FTRACE_FL_ARGS_EN;
> +			}
> +		}
> +
>  		if (flag & FTRACE_FL_DIRECT) {
>  			/*
>  			 * If there's only one user (direct_ops helper)
> @@ -2192,7 +2217,8 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
>  		 * and REGS states. The _EN flags must be disabled though.
>  		 */
>  		rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
> -				FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN);
> +				FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN |
> +				FTRACE_FL_ARGS_EN);
>  	}
>  
>  	ftrace_bug_type = FTRACE_BUG_NOP;
> @@ -3630,7 +3656,8 @@ static int t_show(struct seq_file *m, void *v)
>  			   ftrace_rec_count(rec),
>  			   rec->flags & FTRACE_FL_REGS ? " R" : "  ",
>  			   rec->flags & FTRACE_FL_IPMODIFY ? " I" : "  ",
> -			   rec->flags & FTRACE_FL_DIRECT ? " D" : "  ");
> +			   rec->flags & FTRACE_FL_DIRECT ? " D" :
> +			   rec->flags & FTRACE_FL_ARGS ? " A" : "  ");
>  		if (rec->flags & FTRACE_FL_TRAMP_EN) {
>  			ops = ftrace_find_tramp_ops_any(rec);
>  			if (ops) {
> diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
> index 2c2126e1871d..a3da84b0e599 100644
> --- a/kernel/trace/trace_functions.c
> +++ b/kernel/trace/trace_functions.c
> @@ -86,9 +86,48 @@ void ftrace_destroy_function_files(struct trace_array *tr)
>  	ftrace_free_ftrace_ops(tr);
>  }
>  
> +static void function_args_trace_call(unsigned long ip,
> +				     unsigned long *parent_ip,
> +				     struct ftrace_ops *op,
> +				     unsigned long *stack,
> +				     unsigned long *args)
> +{
> +	struct trace_array *tr = op->private;
> +	struct trace_array_cpu *data;
> +	unsigned long flags;
> +	int bit;
> +	int cpu;
> +	int pc;
> +
> +	if (unlikely(!tr->function_enabled))
> +		return;
> +
> +	pc = preempt_count();
> +	preempt_disable_notrace();
> +
> +	bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
> +	if (bit < 0)
> +		goto out;
> +
> +	cpu = smp_processor_id();
> +	data = per_cpu_ptr(tr->array_buffer.data, cpu);
> +	if (!atomic_read(&data->disabled)) {
> +		local_save_flags(flags);
> +		trace_function(tr, ip, *parent_ip, flags, pc);
> +		trace_printk("%pS %lx %lx %lx %lx %lx %lx\n",
> +			     (void *)ip, args[0], args[1], args[2], args[3],
> +			     args[4], args[5]);
> +	}
> +	trace_clear_recursion(bit);
> +
> + out:
> +	preempt_enable_notrace();
> +
> +}
> +
>  static int function_trace_init(struct trace_array *tr)
>  {
> -	ftrace_func_t func;
> +	ftrace_callback_t callback;
>  
>  	/*
>  	 * Instance trace_arrays get their ops allocated
> @@ -101,11 +140,14 @@ static int function_trace_init(struct trace_array *tr)
>  	/* Currently only the global instance can do stack tracing */
>  	if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
>  	    func_flags.val & TRACE_FUNC_OPT_STACK)
> -		func = function_stack_trace_call;
> -	else
> -		func = function_trace_call;
> +		callback.func = function_stack_trace_call;
> +	else {
> +		tr->ops->flags |= FTRACE_OPS_FL_ARGS;
> +		callback.args_func = function_args_trace_call;
> +	}
> +//	func = function_trace_call;
> 
> -	ftrace_init_array_ops(tr, func);
> +	ftrace_init_array_ops(tr, callback.func);
> 
>  	tr->array_buffer.cpu = get_cpu();
>  	put_cpu();
> 
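For illustration, here is a minimal, untested sketch of how a module could hook
the proposed args callback. The ftrace_args_func_t signature and the
FTRACE_OPS_FL_ARGS flag come from the PoC above; my_args_callback, my_args_ops
and the "vfs_read" filter are made-up names for this example, while
register_ftrace_function(), unregister_ftrace_function() and ftrace_set_filter()
are existing ftrace APIs. The cast to ftrace_func_t mirrors what the PoC's
union ftrace_callback does internally.

  #include <linux/ftrace.h>
  #include <linux/module.h>
  #include <linux/string.h>

  /*
   * Callback with the PoC's proposed signature: args[] holds the six
   * integer/pointer argument registers (%rdi, %rsi, %rdx, %rcx, %r8, %r9
   * on x86-64) that ftrace_args_caller saved on the stack.
   */
  static void my_args_callback(unsigned long ip, unsigned long *parent_ip,
  			     struct ftrace_ops *op, unsigned long *stack,
  			     unsigned long *args)
  {
  	/* vfs_read(file, buf, count, pos): dump the first two arguments */
  	trace_printk("%ps(%lx, %lx)\n", (void *)ip, args[0], args[1]);
  }

  static struct ftrace_ops my_args_ops = {
  	/* the PoC still routes the callback through ->func, hence the cast */
  	.func	= (ftrace_func_t)my_args_callback,
  	.flags	= FTRACE_OPS_FL_ARGS,
  };

  static int __init args_poc_init(void)
  {
  	int ret;

  	/* restrict the ops to a single function before registering it */
  	ret = ftrace_set_filter(&my_args_ops, "vfs_read",
  				strlen("vfs_read"), 0);
  	if (ret)
  		return ret;

  	return register_ftrace_function(&my_args_ops);
  }

  static void __exit args_poc_exit(void)
  {
  	unregister_ftrace_function(&my_args_ops);
  }

  module_init(args_poc_init);
  module_exit(args_poc_exit);
  MODULE_LICENSE("GPL");

With the PoC applied and this module loaded, each vfs_read() call would show up
in the trace buffer with its first two arguments, without paying for a full
pt_regs save the way FTRACE_OPS_FL_SAVE_REGS users do.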