Re: [PATCH RFC bpf-next 1/3] uprobe: Add uretprobe syscall to speed up return probe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Mar 18, 2024 at 2:31 AM Jiri Olsa <jolsa@xxxxxxxxxx> wrote:
>
> Adding uretprobe syscall instead of trap to speed up return probe.
>
> At the moment the uretprobe setup/path is:
>
>   - install entry uprobe
>
>   - when the uprobe is hit overwrite probed function's return address on stack
>     with address of the trampoline that contains breakpoint instruction
>
>   - the breakpoint trap code handles the uretprobe consumers execution and
>     jumps back to original return address
>
> This patch changes the above trampoline's breakpoint instruction to new
> ureprobe syscall call. This syscall does exactly the same job as the trap
> with some extra work:
>
>   - syscall trampoline must save original value for rax/r11/rcx registers
>     on stack - rax is set to syscall number and r11/rcx are changed and
>     used by syscall instruction
>
>   - the syscall code reads the original values of those registers and restore
>     those values in task's pt_regs area
>
> Even with the extra registers handling code the having uretprobes handled
> by syscalls shows speed improvement.
>
>   On Intel (11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz)
>
>   current:
>
>     base           :   15.173 ± 0.052M/s
>     uprobe-nop     :    2.797 ± 0.090M/s
>     uprobe-push    :    2.580 ± 0.007M/s
>     uprobe-ret     :    1.001 ± 0.003M/s
>     uretprobe-nop  :    1.373 ± 0.002M/s
>     uretprobe-push :    1.346 ± 0.002M/s
>     uretprobe-ret  :    0.747 ± 0.001M/s
>
>   with the fix:
>
>     base           :   15.704 ± 0.076M/s
>     uprobe-nop     :    2.841 ± 0.008M/s
>     uprobe-push    :    2.666 ± 0.029M/s
>     uprobe-ret     :    1.037 ± 0.008M/s
>     uretprobe-nop  :    1.718 ± 0.010M/s  < ~25% speed up
>     uretprobe-push :    1.658 ± 0.008M/s  < ~23% speed up
>     uretprobe-ret  :    0.853 ± 0.004M/s  < ~14% speed up
>
>   On Amd (AMD Ryzen 7 5700U)
>
>   current:
>
>     base           :    5.702 ± 0.003M/s
>     uprobe-nop     :    1.505 ± 0.011M/s
>     uprobe-push    :    1.388 ± 0.008M/s
>     uprobe-ret     :    0.825 ± 0.001M/s
>     uretprobe-nop  :    0.782 ± 0.001M/s
>     uretprobe-push :    0.750 ± 0.001M/s
>     uretprobe-ret  :    0.544 ± 0.001M/s
>
>   with the fix:
>
>     base           :    5.669 ± 0.004M/s
>     uprobe-nop     :    1.539 ± 0.001M/s
>     uprobe-push    :    1.385 ± 0.003M/s
>     uprobe-ret     :    0.819 ± 0.001M/s
>     uretprobe-nop  :    0.889 ± 0.001M/s < ~13% speed up
>     uretprobe-push :    0.846 ± 0.001M/s < ~12% speed up
>     uretprobe-ret  :    0.594 ± 0.000M/s <  ~9% speed up
>
> Suggested-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> Signed-off-by: Jiri Olsa <jolsa@xxxxxxxxxx>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  arch/x86/kernel/uprobes.c              | 48 ++++++++++++++++++++++++++
>  include/linux/syscalls.h               |  2 ++
>  include/linux/uprobes.h                |  2 ++
>  include/uapi/asm-generic/unistd.h      |  5 ++-
>  kernel/events/uprobes.c                | 18 +++++++---
>  kernel/sys_ni.c                        |  2 ++
>  7 files changed, 73 insertions(+), 5 deletions(-)
>

LGTM, but see some questions and nits below:

Acked-by: Andrii Nakryiko <andrii@xxxxxxxxxx>

> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 7e8d46f4147f..af0a33ab06ee 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -383,6 +383,7 @@
>  459    common  lsm_get_self_attr       sys_lsm_get_self_attr
>  460    common  lsm_set_self_attr       sys_lsm_set_self_attr
>  461    common  lsm_list_modules        sys_lsm_list_modules
> +462    64      uretprobe               sys_uretprobe
>
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 6c07f6daaa22..069371e86180 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -12,6 +12,7 @@
>  #include <linux/ptrace.h>
>  #include <linux/uprobes.h>
>  #include <linux/uaccess.h>
> +#include <linux/syscalls.h>
>
>  #include <linux/kdebug.h>
>  #include <asm/processor.h>
> @@ -308,6 +309,53 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
>  }
>
>  #ifdef CONFIG_X86_64
> +
> +asm (
> +       ".pushsection .rodata\n"
> +       ".global uretprobe_syscall_entry\n"
> +       "uretprobe_syscall_entry:\n"
> +       "pushq %rax\n"
> +       "pushq %rcx\n"
> +       "pushq %r11\n"
> +       "movq $462, %rax\n"

nit: is it possible to avoid hardcoding 462 here? Can we use
__NR_uretprobe instead?

> +       "syscall\n"

oh, btw, do we need to save flags register as well or it's handled
somehow? I think according to manual syscall instruction does
something to rflags register. So do we need pushfq before syscall?

> +       ".global uretprobe_syscall_end\n"
> +       "uretprobe_syscall_end:\n"
> +       ".popsection\n"
> +);
> +
> +extern u8 uretprobe_syscall_entry[];
> +extern u8 uretprobe_syscall_end[];
> +
> +void *arch_uprobe_trampoline(unsigned long *psize)
> +{
> +       *psize = uretprobe_syscall_end - uretprobe_syscall_entry;
> +       return uretprobe_syscall_entry;
> +}
> +
> +SYSCALL_DEFINE0(uretprobe)
> +{
> +       struct pt_regs *regs = task_pt_regs(current);
> +       unsigned long sregs[3], err;
> +
> +       /*
> +        * We set rax and syscall itself changes rcx and r11, so the syscall
> +        * trampoline saves their original values on stack. We need to read
> +        * them and set original register values and fix the rsp pointer back.
> +        */
> +       err = copy_from_user((void *) &sregs, (void *) regs->sp, sizeof(sregs));
> +       WARN_ON_ONCE(err);
> +
> +       regs->r11 = sregs[0];
> +       regs->cx = sregs[1];
> +       regs->ax = sregs[2];
> +       regs->orig_ax = -1;
> +       regs->sp += sizeof(sregs);
> +
> +       uprobe_handle_trampoline(regs);

probably worth leaving a comment that uprobe_handle_trampoline() is
rewriting userspace RIP and so syscall "returns" to the original
caller.

> +       return regs->ax;

and this is making sure that caller function gets the correct function
return value, right? It's all a bit magical, so worth leaving a
comment here, IMO.

> +}
> +
>  /*
>   * If arch_uprobe->insn doesn't use rip-relative addressing, return
>   * immediately.  Otherwise, rewrite the instruction so that it accesses
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 77eb9b0e7685..db150794f89d 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -972,6 +972,8 @@ asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags);
>  /* x86 */
>  asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
>
> +asmlinkage long sys_uretprobe(void);
> +
>  /* pciconfig: alpha, arm, arm64, ia64, sparc */
>  asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
>                                 unsigned long off, unsigned long len,
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index f46e0ca0169c..a490146ad89d 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -138,6 +138,8 @@ extern bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
>  extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
>  extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
>                                          void *src, unsigned long len);
> +extern void uprobe_handle_trampoline(struct pt_regs *regs);
> +extern void *arch_uprobe_trampoline(unsigned long *psize);
>  #else /* !CONFIG_UPROBES */
>  struct uprobes_state {
>  };
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 75f00965ab15..8a747cd1d735 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
>  #define __NR_lsm_list_modules 461
>  __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
>
> +#define __NR_uretprobe 462
> +__SYSCALL(__NR_uretprobe, sys_uretprobe)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 462
> +#define __NR_syscalls 463
>
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 929e98c62965..90395b16bde0 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -1474,11 +1474,20 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
>         return ret;
>  }
>
> +void * __weak arch_uprobe_trampoline(unsigned long *psize)
> +{
> +       static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
> +
> +       *psize = UPROBE_SWBP_INSN_SIZE;
> +       return &insn;
> +}
> +
>  static struct xol_area *__create_xol_area(unsigned long vaddr)
>  {
>         struct mm_struct *mm = current->mm;
> -       uprobe_opcode_t insn = UPROBE_SWBP_INSN;
> +       unsigned long insns_size;
>         struct xol_area *area;
> +       void *insns;
>
>         area = kmalloc(sizeof(*area), GFP_KERNEL);
>         if (unlikely(!area))
> @@ -1502,7 +1511,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
>         /* Reserve the 1st slot for get_trampoline_vaddr() */
>         set_bit(0, area->bitmap);
>         atomic_set(&area->slot_count, 1);
> -       arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
> +       insns = arch_uprobe_trampoline(&insns_size);
> +       arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
>
>         if (!xol_add_vma(mm, area))
>                 return area;
> @@ -2123,7 +2133,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri)
>         return ri;
>  }
>
> -static void handle_trampoline(struct pt_regs *regs)
> +void uprobe_handle_trampoline(struct pt_regs *regs)
>  {
>         struct uprobe_task *utask;
>         struct return_instance *ri, *next;
> @@ -2188,7 +2198,7 @@ static void handle_swbp(struct pt_regs *regs)
>
>         bp_vaddr = uprobe_get_swbp_addr(regs);
>         if (bp_vaddr == get_trampoline_vaddr())
> -               return handle_trampoline(regs);
> +               return uprobe_handle_trampoline(regs);
>
>         uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
>         if (!uprobe) {
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index faad00cce269..be6195e0d078 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -391,3 +391,5 @@ COND_SYSCALL(setuid16);
>
>  /* restartable sequence */
>  COND_SYSCALL(rseq);
> +
> +COND_SYSCALL(uretprobe);
> --
> 2.44.0
>





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux