Re: [PATCH bpf-next v2] bpf: Use fake pt_regs when doing bpf syscall tracepoint tracing

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Sep 10, 2024 at 2:40 PM Yonghong Song <yonghong.song@xxxxxxxxx> wrote:
>
> Salvatore Benedetto reported an issue that when doing syscall tracepoint
> tracing the kernel stack is empty. For example, using the following
> command line
>   bpftrace -e 'tracepoint:syscalls:sys_enter_read { print("Kernel Stack\n"); print(kstack()); }'
>   bpftrace -e 'tracepoint:syscalls:sys_exit_read { print("Kernel Stack\n"); print(kstack()); }'
> the output for both commands is
> ===
>   Kernel Stack
> ===
>
> Further analysis shows that pt_regs used for bpf syscall tracepoint
> tracing is from the one constructed during user->kernel transition.
> The call stack looks like
>   perf_syscall_enter+0x88/0x7c0
>   trace_sys_enter+0x41/0x80
>   syscall_trace_enter+0x100/0x160
>   do_syscall_64+0x38/0xf0
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> The ip address stored in pt_regs is from user space hence no kernel
> stack is printed.
>
> To fix the issue, kernel address from pt_regs is required.
> In kernel repo, there are already a few cases like this. For example,
> in kernel/trace/bpf_trace.c, several perf_fetch_caller_regs(fake_regs_ptr)
> instances are used to supply ip address or use ip address to construct
> call stack.
>
> Instead of allocate fake_regs in the stack which may consume
> a lot of bytes, the function perf_trace_buf_alloc() in
> perf_syscall_{enter, exit}() is leveraged to create fake_regs,
> which will be passed to perf_call_bpf_{enter,exit}().
>
> For the above bpftrace script, I got the following output with this patch:
> for tracepoint:syscalls:sys_enter_read
> ===
>   Kernel Stack
>
>         syscall_trace_enter+407
>         syscall_trace_enter+407
>         do_syscall_64+74
>         entry_SYSCALL_64_after_hwframe+75
> ===
> and for tracepoint:syscalls:sys_exit_read
> ===
> Kernel Stack
>
>         syscall_exit_work+185
>         syscall_exit_work+185

this duplication is unfortunate, but we have the same with non-syscall
tracepoint, so it's not really a new "issue"

>         syscall_exit_to_user_mode+305
>         do_syscall_64+118
>         entry_SYSCALL_64_after_hwframe+75
> ===
>
> Reported-by: Salvatore Benedetto <salvabenedetto@xxxxxxxx>
> Suggested-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> ---
>  kernel/trace/trace_syscalls.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
>

Looks great, thank you!

Acked-by: Andrii Nakryiko <andrii@xxxxxxxxxx>

> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index 9c581d6da843..785733245ead 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c
> @@ -564,6 +564,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
>         BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
>
>         /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
> +       perf_fetch_caller_regs(regs);
>         *(struct pt_regs **)&param = regs;
>         param.syscall_nr = rec->nr;
>         for (i = 0; i < sys_data->nb_args; i++)
> @@ -575,6 +576,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>  {
>         struct syscall_metadata *sys_data;
>         struct syscall_trace_enter *rec;
> +       struct pt_regs *fake_regs;
>         struct hlist_head *head;
>         unsigned long args[6];
>         bool valid_prog_array;
> @@ -602,7 +604,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>         size = ALIGN(size + sizeof(u32), sizeof(u64));
>         size -= sizeof(u32);
>
> -       rec = perf_trace_buf_alloc(size, NULL, &rctx);
> +       rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
>         if (!rec)
>                 return;
>
> @@ -611,7 +613,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>         memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
>
>         if ((valid_prog_array &&
> -            !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
> +            !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
>             hlist_empty(head)) {
>                 perf_swevent_put_recursion_context(rctx);
>                 return;
> @@ -666,6 +668,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
>         } __aligned(8) param;
>
>         /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
> +       perf_fetch_caller_regs(regs);
>         *(struct pt_regs **)&param = regs;
>         param.syscall_nr = rec->nr;
>         param.ret = rec->ret;
> @@ -676,6 +679,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>  {
>         struct syscall_metadata *sys_data;
>         struct syscall_trace_exit *rec;
> +       struct pt_regs *fake_regs;
>         struct hlist_head *head;
>         bool valid_prog_array;
>         int syscall_nr;
> @@ -701,7 +705,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>         size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
>         size -= sizeof(u32);
>
> -       rec = perf_trace_buf_alloc(size, NULL, &rctx);
> +       rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
>         if (!rec)
>                 return;
>
> @@ -709,7 +713,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>         rec->ret = syscall_get_return_value(current, regs);
>
>         if ((valid_prog_array &&
> -            !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
> +            !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
>             hlist_empty(head)) {
>                 perf_swevent_put_recursion_context(rctx);
>                 return;
> --
> 2.43.5
>





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux