On Fri, May 26, 2023 at 6:18 AM Masami Hiramatsu (Google)
<mhiramat@xxxxxxxxxx> wrote:
>
> [...] Since
> CONFIG_KPROBES_ON_FTRACE requires the CONFIG_DYNAMIC_FTRACE_WITH_REGS,
> it is not available if the architecture only supports
> CONFIG_DYNAMIC_FTRACE_WITH_ARGS. And that means kprobe events can not
> probe function entry/exit effectively on such architecture.
> But this can be solved if the dynamic events supports fprobe events.

Currently, CONFIG_FPROBE also requires CONFIG_DYNAMIC_FTRACE_WITH_REGS,
so IIUC this will only be true once we have migrated fprobe to use
ftrace_regs instead of pt_regs, right?

We discussed having fprobe use ftrace_regs instead of pt_regs in the
past, and I even had a proof-of-concept branch at one point, but this
patch seems to make that transition quite a bit harder. Have you tried
to make fprobe work on ftrace_regs on top of this patch?
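For reference, this is roughly the shape of the change I experimented
with in that proof-of-concept branch (the prototypes below are
approximate, just to illustrate the direction, not a finished API):

        /* With this series, the fprobe entry handler receives a pt_regs: */
        int (*entry_handler)(struct fprobe *fp, unsigned long entry_ip,
                             unsigned long ret_ip, struct pt_regs *regs,
                             void *entry_data);

        /* What I'd like us to converge on instead (hypothetical): */
        int (*entry_handler)(struct fprobe *fp, unsigned long entry_ip,
                             unsigned long ret_ip, struct ftrace_regs *fregs,
                             void *entry_data);

Handlers that really need a full pt_regs would then have to go through
ftrace_get_regs(fregs) and deal with the NULL case on architectures
where ftrace_regs is not a superset of pt_regs.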
tf->symbol : "unknown"; > +} > + > +static bool trace_fprobe_is_busy(struct dyn_event *ev) > +{ > + struct trace_fprobe *tf = to_trace_fprobe(ev); > + > + return trace_probe_is_enabled(&tf->tp); > +} > + > +static bool trace_fprobe_match_command_head(struct trace_fprobe *tf, > + int argc, const char **argv) > +{ > + char buf[MAX_ARGSTR_LEN + 1]; > + > + if (!argc) > + return true; > + > + snprintf(buf, sizeof(buf), "%s", trace_fprobe_symbol(tf)); > + if (strcmp(buf, argv[0])) > + return false; > + argc--; argv++; > + > + return trace_probe_match_command_args(&tf->tp, argc, argv); > +} > + > +static bool trace_fprobe_match(const char *system, const char *event, > + int argc, const char **argv, struct dyn_event *ev) > +{ > + struct trace_fprobe *tf = to_trace_fprobe(ev); > + > + if (event[0] != '\0' && strcmp(trace_probe_name(&tf->tp), event)) > + return false; > + > + if (system && strcmp(trace_probe_group_name(&tf->tp), system)) > + return false; > + > + return trace_fprobe_match_command_head(tf, argc, argv); > +} > + > +static bool trace_fprobe_is_registered(struct trace_fprobe *tf) > +{ > + return fprobe_is_registered(&tf->fp); > +} > + > +/* > + * Note that we don't verify the fetch_insn code, since it does not come > + * from user space. > + */ > +static int > +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, > + void *base) > +{ > + struct pt_regs *regs = rec; I gave it a try this week and it was mostly a matter of replacing pt_regs with ftrace_regs in this file. Like here for example. Not too bad so far. > + unsigned long val; > + int ret; > + > +retry: > + /* 1st stage: get value from context */ > + switch (code->op) { > + case FETCH_OP_STACK: > + val = regs_get_kernel_stack_nth(regs, code->param); This does not have a ftrace_regs equivalent at the moment. I suppose we could introduce one without too much effort so that's probably ok. 
> +                break;
> +        case FETCH_OP_STACKP:
> +                val = kernel_stack_pointer(regs);
> +                break;
> +        case FETCH_OP_RETVAL:
> +                val = regs_return_value(regs);
> +                break;
> +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
> +        case FETCH_OP_ARG:
> +                val = regs_get_kernel_argument(regs, code->param);
> +                break;
> +#endif
> +        case FETCH_NOP_SYMBOL:  /* Ignore a place holder */
> +                code++;
> +                goto retry;
> +        default:
> +                ret = process_common_fetch_insn(code, &val);
> +                if (ret < 0)
> +                        return ret;
> +        }
> +        code++;
> +
> +        return process_fetch_insn_bottom(code, val, dest, base);
> +}
> +NOKPROBE_SYMBOL(process_fetch_insn)
> +
> +/* function entry handler */
> +static nokprobe_inline void
> +__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
> +                    struct pt_regs *regs,
> +                    struct trace_event_file *trace_file)
> +{
> +        struct fentry_trace_entry_head *entry;
> +        struct trace_event_call *call = trace_probe_event_call(&tf->tp);
> +        struct trace_event_buffer fbuffer;
> +        int dsize;
> +
> +        if (WARN_ON_ONCE(call != trace_file->event_call))
> +                return;
> +
> +        if (trace_trigger_soft_disabled(trace_file))
> +                return;
> +
> +        dsize = __get_data_size(&tf->tp, regs);
> +
> +        entry = trace_event_buffer_reserve(&fbuffer, trace_file,
> +                                           sizeof(*entry) + tf->tp.size + dsize);
> +        if (!entry)
> +                return;
> +
> +        fbuffer.regs = regs;
> +        entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
> +        entry->ip = entry_ip;
> +        store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
> +
> +        trace_event_buffer_commit(&fbuffer);
> +}
> +
> +static void
> +fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
> +                  struct pt_regs *regs)
> +{
> +        struct event_file_link *link;
> +
> +        trace_probe_for_each_link_rcu(link, &tf->tp)
> +                __fentry_trace_func(tf, entry_ip, regs, link->file);
> +}
> +NOKPROBE_SYMBOL(fentry_trace_func);
> +
> +/* Kretprobe handler */
> +static nokprobe_inline void
> +__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
> +                   unsigned long ret_ip, struct pt_regs *regs,
> +                   struct trace_event_file *trace_file)
> +{
> +        struct fexit_trace_entry_head *entry;
> +        struct trace_event_buffer fbuffer;
> +        struct trace_event_call *call = trace_probe_event_call(&tf->tp);
> +        int dsize;
> +
> +        if (WARN_ON_ONCE(call != trace_file->event_call))
> +                return;
> +
> +        if (trace_trigger_soft_disabled(trace_file))
> +                return;
> +
> +        dsize = __get_data_size(&tf->tp, regs);
> +
> +        entry = trace_event_buffer_reserve(&fbuffer, trace_file,
> +                                           sizeof(*entry) + tf->tp.size + dsize);
> +        if (!entry)
> +                return;
> +
> +        fbuffer.regs = regs;
> +        entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
> +        entry->func = entry_ip;
> +        entry->ret_ip = ret_ip;
> +        store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
> +
> +        trace_event_buffer_commit(&fbuffer);
> +}
> +
> +static void
> +fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
> +                 unsigned long ret_ip, struct pt_regs *regs)
> +{
> +        struct event_file_link *link;
> +
> +        trace_probe_for_each_link_rcu(link, &tf->tp)
> +                __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file);
> +}
> +NOKPROBE_SYMBOL(fexit_trace_func);
> +
> +#ifdef CONFIG_PERF_EVENTS
> +
> +static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
> +                            struct pt_regs *regs)
> +{
> +        struct trace_event_call *call = trace_probe_event_call(&tf->tp);
> +        struct fentry_trace_entry_head *entry;
> +        struct hlist_head *head;
> +        int size, __size, dsize;
> +        int rctx;
> +
> +        head = this_cpu_ptr(call->perf_events);
> +        if (hlist_empty(head))
> +                return 0;
> +
> +        dsize = __get_data_size(&tf->tp, regs);
> +        __size = sizeof(*entry) + tf->tp.size + dsize;
> +        size = ALIGN(__size + sizeof(u32), sizeof(u64));
> +        size -= sizeof(u32);
> +
> +        entry = perf_trace_buf_alloc(size, NULL, &rctx);
> +        if (!entry)
> +                return 0;
> +
> +        entry->ip = entry_ip;
> +        memset(&entry[1], 0, dsize);
> +        store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
> +        perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,

However, that call concerns me. Perf requires a pt_regs pointer here,
and it expects certain specific fields of that pt_regs to be set (the
exact requirements don't seem to be explicitly stated anywhere).

For example, on arm64 (the architecture without
CONFIG_DYNAMIC_FTRACE_WITH_REGS on which I'd like to have fprobe... :))
perf calls the user_mode(regs) macro, which expects the pstate register
to be set in pt_regs. However, pstate is invalid outside of an
exception entry, so ftrace_regs on arm64 does not have a pstate field
at all.

If we migrate fprobe to ftrace_regs and try to construct a sparse
pt_regs out of an ftrace_regs here, it wouldn't be enough to just copy
all the registers we know from ftrace_regs into the pt_regs: we would
also need to make up the pstate register with the knowledge that it has
to be set in a certain way specifically to please perf... Arch code
wouldn't only have to provide an "expand_ftrace_regs_into_sparse_pt_regs"
macro but also an "invent_registers_for_perf" macro, similar to the
current perf_arch_fetch_caller_regs macro. This seems rough...

It sounds to me like we should have avoided the use of sparse, "made
up" pt_regs a long time back, and the more users of fprobe that expect
pt_regs we add, the more difficult we make it to get fprobe working on
CONFIG_DYNAMIC_FTRACE_WITH_ARGS.
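To make that concrete: with today's arm64 ftrace_regs layout (x0-x8,
fp, lr, sp, pc), the shim would have to look something like this
(entirely made-up sketch, with the pstate trick lifted from the
existing arm64 perf_arch_fetch_caller_regs() macro):

        /* Hypothetical arm64 helper: expand an ftrace_regs into a
         * sparse pt_regs that the perf code above will accept. */
        static inline void
        ftrace_regs_to_perf_regs(const struct ftrace_regs *fregs,
                                 struct pt_regs *regs)
        {
                memset(regs, 0, sizeof(*regs));

                /* The registers we actually have... */
                memcpy(regs->regs, fregs->regs, sizeof(fregs->regs));
                regs->regs[29] = fregs->fp;
                regs->regs[30] = fregs->lr;
                regs->sp = fregs->sp;
                regs->pc = fregs->pc;

                /* ...and one we don't: there is no pstate to copy, we
                 * can only invent one so that user_mode(regs) sees EL1,
                 * like perf_arch_fetch_caller_regs() does. */
                regs->pstate = PSR_MODE_EL1h;
        }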
> +                              head, NULL);
> +        return 0;
> +}
> +NOKPROBE_SYMBOL(fentry_perf_func);
> +
> +static void
> +fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
> +                unsigned long ret_ip, struct pt_regs *regs)
> +{
> +        struct trace_event_call *call = trace_probe_event_call(&tf->tp);
> +        struct fexit_trace_entry_head *entry;
> +        struct hlist_head *head;
> +        int size, __size, dsize;
> +        int rctx;
> +
> +        head = this_cpu_ptr(call->perf_events);
> +        if (hlist_empty(head))
> +                return;
> +
> +        dsize = __get_data_size(&tf->tp, regs);
> +        __size = sizeof(*entry) + tf->tp.size + dsize;
> +        size = ALIGN(__size + sizeof(u32), sizeof(u64));
> +        size -= sizeof(u32);
> +
> +        entry = perf_trace_buf_alloc(size, NULL, &rctx);
> +        if (!entry)
> +                return;
> +
> +        entry->func = entry_ip;
> +        entry->ret_ip = ret_ip;
> +        store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
> +        perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
> +                              head, NULL);
> +}
> +NOKPROBE_SYMBOL(fexit_perf_func);
> +#endif /* CONFIG_PERF_EVENTS */
> +
> +static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip,
> +                             unsigned long ret_ip, struct pt_regs *regs,
> +                             void *entry_data)
> +{
> +        struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
> +        int ret = 0;
> +
> +        if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
> +                fentry_trace_func(tf, entry_ip, regs);
> +#ifdef CONFIG_PERF_EVENTS
> +        if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
> +                ret = fentry_perf_func(tf, entry_ip, regs);
> +#endif
> +        return ret;
> +}
> +NOKPROBE_SYMBOL(fentry_dispatcher);
> +
> +static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip,
> +                             unsigned long ret_ip, struct pt_regs *regs,
> +                             void *entry_data)
> +{
> +        struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
> +
> +        if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
> +                fexit_trace_func(tf, entry_ip, ret_ip, regs);
> +#ifdef CONFIG_PERF_EVENTS
> +        if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
> +                fexit_perf_func(tf, entry_ip, ret_ip, regs);
> +#endif
> +}
> +NOKPROBE_SYMBOL(fexit_dispatcher);
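FWIW, in the proof-of-concept conversion, a dispatcher like the one
above ended up looking roughly like this (sketch only, assuming
fentry_trace_func() and friends were converted to take an ftrace_regs
too):

        static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip,
                                     unsigned long ret_ip,
                                     struct ftrace_regs *fregs,
                                     void *entry_data)
        {
                struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
                int ret = 0;

                if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
                        fentry_trace_func(tf, entry_ip, fregs);
        #ifdef CONFIG_PERF_EVENTS
                if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) {
                        /* ftrace_get_regs() returns NULL when fregs does
                         * not hold a full pt_regs, i.e. on WITH_ARGS-only
                         * architectures */
                        struct pt_regs *regs = ftrace_get_regs(fregs);

                        if (regs)
                                ret = fentry_perf_func(tf, entry_ip, regs);
                }
        #endif
                return ret;
        }

That keeps the tracing path working everywhere, but the perf path then
silently does nothing on WITH_ARGS-only architectures, which is exactly
the kind of trade-off I think we should settle before more pt_regs
users of fprobe pile up.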