Re: [RFC PATCH bpf-next 3/5] libbpf: usdt lib wiring of xmm reads

Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> · Mon, 16 May 2022 16:26:08 -0700

On Thu, May 12, 2022 at 12:43 AM Dave Marchevsky <davemarchevsky@xxxxxx> wrote:
>
> Handle xmm0,...,xmm15 registers when parsing USDT arguments. Currently
> only the first 64 bits of the fetched value are returned as I haven't
> seen the rest of the register used in practice.
>
> This patch also handles floats in USDT arg spec by ignoring the fact
> that they're floats and considering them scalar. Currently we can't do
> float math in BPF programs anyways, so might as well support passing to
> userspace and converting there.
>
> We can use existing ARG_REG sscanf + logic, adding XMM-specific logic
> when calc_pt_regs_off fails. If the reg is xmm, arg_spec's reg_off is
> repurposed to hold reg_no, which is passed to bpf_get_reg_val. Since the
> helper does the digging around in fxregs_state it's not necessary to
> calculate offset in bpf code for these regs.
>
> NOTE: Changes here cause verification failure for existing USDT tests.
> Specifically, BPF_USDT prog 'usdt12' fails to verify due to too many
> insns despite not having its insn count significantly changed.
>
> Signed-off-by: Dave Marchevsky <davemarchevsky@xxxxxx>
> ---
>  tools/lib/bpf/usdt.bpf.h | 36 ++++++++++++++++++++--------
>  tools/lib/bpf/usdt.c     | 51 ++++++++++++++++++++++++++++++++++++----
>  2 files changed, 73 insertions(+), 14 deletions(-)
>
> diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h
> index 4181fddb3687..7b5ed4cbaa2f 100644
> --- a/tools/lib/bpf/usdt.bpf.h
> +++ b/tools/lib/bpf/usdt.bpf.h
> @@ -43,6 +43,7 @@ enum __bpf_usdt_arg_type {
>         BPF_USDT_ARG_CONST,
>         BPF_USDT_ARG_REG,
>         BPF_USDT_ARG_REG_DEREF,
> +       BPF_USDT_ARG_XMM_REG,
>  };
>
>  struct __bpf_usdt_arg_spec {
> @@ -129,7 +130,9 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
>  {
>         struct __bpf_usdt_spec *spec;
>         struct __bpf_usdt_arg_spec *arg_spec;
> -       unsigned long val;
> +       struct pt_regs *btf_regs;
> +       struct task_struct *btf_task;
> +       struct { __u64 a; __u64 unused; } val = {};
>         int err, spec_id;
>
>         *res = 0;
> @@ -151,7 +154,7 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
>                 /* Arg is just a constant ("-4@$-9" in USDT arg spec).
>                  * value is recorded in arg_spec->val_off directly.
>                  */
> -               val = arg_spec->val_off;
> +               val.a = arg_spec->val_off;
>                 break;
>         case BPF_USDT_ARG_REG:
>                 /* Arg is in a register (e.g, "8@%rax" in USDT arg spec),
> @@ -159,7 +162,20 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
>                  * struct pt_regs. To keep things simple user-space parts
>                  * record offsetof(struct pt_regs, <regname>) in arg_spec->reg_off.
>                  */
> -               err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off);
> +               err = bpf_probe_read_kernel(&val.a, sizeof(val.a), (void *)ctx + arg_spec->reg_off);
> +               if (err)
> +                       return err;
> +               break;
> +       case BPF_USDT_ARG_XMM_REG:

nit: a bit too XMM-specific name here, we probably want to keep it a bit

> +               /* Same as above, but arg is an xmm reg, so can't look
> +                * in pt_regs, need to use special helper.
> +                * reg_off is the regno ("xmm0" -> regno 0, etc)
> +                */
> +               btf_task = bpf_get_current_task_btf();
> +               btf_regs = (struct pt_regs *)bpf_task_pt_regs(btf_task);

I'd like to avoid taking dependency on bpf_get_current_task_btf() for
rare case of XMM register, which makes it impossible to do USDT on
older kernels. It seems like supporting reading registers from current
(and maybe current pt_regs context) should cover a lot of practical
uses.

> +               err = bpf_get_reg_val(&val, sizeof(val),

But regardless of the above, we'll need to use CO-RE to detect support
for this new BPF helper (probably using bpf_core_enum_value_exists()?)
to allow using USDTs on older kernels.

> +                                    ((u64)arg_spec->reg_off + BPF_GETREG_X86_XMM0) << 32,
> +                                    btf_regs, btf_task);
>                 if (err)
>                         return err;
>                 break;
> @@ -171,14 +187,14 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
>                  * from pt_regs, then do another user-space probe read to
>                  * fetch argument value itself.
>                  */
> -               err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off);
> +               err = bpf_probe_read_kernel(&val.a, sizeof(val.a), (void *)ctx + arg_spec->reg_off);
>                 if (err)
>                         return err;
> -               err = bpf_probe_read_user(&val, sizeof(val), (void *)val + arg_spec->val_off);
> +               err = bpf_probe_read_user(&val.a, sizeof(val.a), (void *)val.a + arg_spec->val_off);

is the useful value in xmm register normally in lower 64-bits of it?
is it possible to just request reading just the first 64 bits from
bpf_get_reg_val() and avoid this ugly union?

>                 if (err)
>                         return err;
>  #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
> -               val >>= arg_spec->arg_bitshift;
> +               val.a >>= arg_spec->arg_bitshift;
>  #endif
>                 break;
>         default:

[...]

> +static int calc_xmm_regno(const char *reg_name)
> +{
> +       static struct {
> +               const char *name;
> +               __u16 regno;
> +       } xmm_reg_map[] = {
> +               { "xmm0",  0 },
> +               { "xmm1",  1 },
> +               { "xmm2",  2 },
> +               { "xmm3",  3 },
> +               { "xmm4",  4 },
> +               { "xmm5",  5 },
> +               { "xmm6",  6 },
> +               { "xmm7",  7 },
> +#ifdef __x86_64__
> +               { "xmm8",  8 },
> +               { "xmm9",  9 },
> +               { "xmm10",  10 },
> +               { "xmm11",  11 },
> +               { "xmm12",  12 },
> +               { "xmm13",  13 },
> +               { "xmm14",  14 },
> +               { "xmm15",  15 },

no-x86 arches parse this generically with sscanf(), seems like we can
do this simple approach here as well?

> +#endif
> +       };
> +       int i;
> +
> +       for (i = 0; i < ARRAY_SIZE(xmm_reg_map); i++) {
> +               if (strcmp(reg_name, xmm_reg_map[i].name) == 0)
> +                       return xmm_reg_map[i].regno;
> +       }
> +
>         return -ENOENT;
>  }
>

[...]