On Thu, May 12, 2022 at 12:43:18AM -0700, Dave Marchevsky wrote: > Add a helper which reads the value of specified register into memory. > > Currently, bpf programs only have access to general-purpose registers > via struct pt_regs. Other registers, like SSE regs %xmm0-15, are > inaccessible, which makes some tracing usecases impossible. For example, > User Statically-Defined Tracing (USDT) probes may use SSE registers to > pass their arguments on x86. While this patch adds support for %xmm0-15 > only, the helper is meant to be generic enough to support fetching any > reg. > > A useful "value of register" definition for bpf programs is "value of > register before control transfer to kernel". pt_regs gives us this > currently, so it's the default behavior of the new helper. Fetching the > actual _current_ reg value is possible, though, by passing > BPF_GETREG_F_CURRENT flag as part of input. > > For SSE regs we try to avoid digging around in task's fpu state by first > reading _current_ value, then checking to see if the state of cpu's > floating point regs matches task's view of them. If so, we can just > return _current_ value. > > Further usecases which are straightforward to support, but > unimplemented: > * using the helper to fetch general-purpose register value. > currently-unused pt_regs parameter exists for this reason. > > * fetching rdtsc (w/ BPF_GETREG_F_CURRENT) > > * other architectures. s390 specifically might benefit from similar > fpu reg fetching as USDT library was recently updated to support that > architecture. 
> > Signed-off-by: Dave Marchevsky <davemarchevsky@xxxxxx> > --- > include/uapi/linux/bpf.h | 40 +++++++++ > kernel/trace/bpf_trace.c | 148 +++++++++++++++++++++++++++++++++ > kernel/trace/bpf_trace.h | 1 + > tools/include/uapi/linux/bpf.h | 40 +++++++++ > 4 files changed, 229 insertions(+) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 444fe6f1cf35..3ef8f683ed9e 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -5154,6 +5154,18 @@ union bpf_attr { > * if not NULL, is a reference which must be released using its > * corresponding release function, or moved into a BPF map before > * program exit. > + * > + * long bpf_get_reg_val(void *dst, u32 size, u64 getreg_spec, struct pt_regs *regs, struct task_struct *tsk) > + * Description > + * Store the value of a SSE register specified by *getreg_spec* > + * into memory region of size *size* specified by *dst*. *getreg_spec* > + * is a combination of BPF_GETREG enum AND BPF_GETREG_F flag e.g. 
> + * (BPF_GETREG_X86_XMM0 << 32) | BPF_GETREG_F_CURRENT.* > + * Return > + * 0 on success > + * **-ENOENT** if the system architecture does not have requested reg > + * **-EINVAL** if *getreg_spec* is invalid > + * **-EINVAL** if *size* != bytes necessary to store requested reg val > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -5351,6 +5363,7 @@ union bpf_attr { > FN(skb_set_tstamp), \ > FN(ima_file_hash), \ > FN(kptr_xchg), \ > + FN(get_reg_val), \ > /* */ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > @@ -6318,6 +6331,33 @@ struct bpf_perf_event_value { > __u64 running; > }; > > +/* bpf_get_reg_val register enum */ > +enum { > + BPF_GETREG_X86_XMM0 = 0, > + BPF_GETREG_X86_XMM1, > + BPF_GETREG_X86_XMM2, > + BPF_GETREG_X86_XMM3, > + BPF_GETREG_X86_XMM4, > + BPF_GETREG_X86_XMM5, > + BPF_GETREG_X86_XMM6, > + BPF_GETREG_X86_XMM7, > + BPF_GETREG_X86_XMM8, > + BPF_GETREG_X86_XMM9, > + BPF_GETREG_X86_XMM10, > + BPF_GETREG_X86_XMM11, > + BPF_GETREG_X86_XMM12, > + BPF_GETREG_X86_XMM13, > + BPF_GETREG_X86_XMM14, > + BPF_GETREG_X86_XMM15, > + __MAX_BPF_GETREG, > +}; Can we do BPF_GETREG_X86_XMM plus number instead? Enumerating every possible register will take quite some space in uapi and bpf progs probably won't be using these enum values directly anyway. usdt spec will have something like "xmm5" as a string. 
> + > +/* bpf_get_reg_val flags */ > +enum { > + BPF_GETREG_F_NONE = 0, > + BPF_GETREG_F_CURRENT = (1U << 0), > +}; > + > enum { > BPF_DEVCG_ACC_MKNOD = (1ULL << 0), > BPF_DEVCG_ACC_READ = (1ULL << 1), > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > index f15b826f9899..0de7d6b3af5b 100644 > --- a/kernel/trace/bpf_trace.c > +++ b/kernel/trace/bpf_trace.c > @@ -28,6 +28,10 @@ > > #include <asm/tlb.h> > > +#ifdef CONFIG_X86 > +#include <asm/fpu/context.h> > +#endif > + > #include "trace_probe.h" > #include "trace.h" > > @@ -1166,6 +1170,148 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { > .arg1_type = ARG_PTR_TO_CTX, > }; > > +#define XMM_REG_SZ 16 > + > +#define __xmm_space_off(regno) \ > + case BPF_GETREG_X86_XMM ## regno: \ > + xmm_space_off = regno * 16; \ > + break; > + > +static long getreg_read_xmm_fxsave(u32 reg, struct task_struct *tsk, > + void *data) > +{ > + struct fxregs_state *fxsave; > + u32 xmm_space_off; > + > + switch (reg) { > + __xmm_space_off(0); > + __xmm_space_off(1); > + __xmm_space_off(2); > + __xmm_space_off(3); > + __xmm_space_off(4); > + __xmm_space_off(5); > + __xmm_space_off(6); > + __xmm_space_off(7); > +#ifdef CONFIG_X86_64 > + __xmm_space_off(8); > + __xmm_space_off(9); > + __xmm_space_off(10); > + __xmm_space_off(11); > + __xmm_space_off(12); > + __xmm_space_off(13); > + __xmm_space_off(14); > + __xmm_space_off(15); > +#endif > + default: > + return -EINVAL; > + } > + > + fxsave = &tsk->thread.fpu.fpstate->regs.fxsave; > + memcpy(data, (void *)&fxsave->xmm_space + xmm_space_off, XMM_REG_SZ); > + return 0; It's all arch-specific. This one and the majority of the other functions should probably go into arch/x86/net/bpf_jit_comp.c (?) instead of generic code. bpf_trace.c doesn't fit. Try to avoid all ifdefs. They're a red flag.
> +static long bpf_read_sse_reg(u32 reg, u32 flags, struct task_struct *tsk, > + void *data) > +{ > +#ifdef CONFIG_X86 > + unsigned long irq_flags; > + long err; > + > + switch (reg) { > + __bpf_sse_read(0); > + __bpf_sse_read(1); > + __bpf_sse_read(2); > + __bpf_sse_read(3); > + __bpf_sse_read(4); > + __bpf_sse_read(5); > + __bpf_sse_read(6); > + __bpf_sse_read(7); > +#ifdef CONFIG_X86_64 > + __bpf_sse_read(8); > + __bpf_sse_read(9); > + __bpf_sse_read(10); > + __bpf_sse_read(11); > + __bpf_sse_read(12); > + __bpf_sse_read(13); > + __bpf_sse_read(14); > + __bpf_sse_read(15); > +#endif /* CONFIG_X86_64 */ > + default: > + return -EINVAL; > + } > + > + if (flags & BPF_GETREG_F_CURRENT) > + return 0; > + > + if (!fpregs_state_valid(&tsk->thread.fpu, smp_processor_id())) { > + local_irq_save(irq_flags); Why disable irqs here? > + err = getreg_read_xmm_fxsave(reg, tsk, data); > + local_irq_restore(irq_flags); > + return err; > + } What is the use case for reading another task's regs?