As of today, to trigger a BPF program from user space, the common practice is to create a uprobe on a special function and call that function. For example, bpftrace uses BEGIN_trigger and END_trigger for the BEGIN and END programs. However, uprobe is not ideal for this use case. First, uprobe uses a trap, which adds non-trivial overhead. Second, uprobe requires calculating the function offset at runtime, which is not very reliable. bpftrace has seen issues with this: https://github.com/iovisor/bpftrace/pull/1438 https://github.com/iovisor/bpftrace/issues/1440 This patch introduces a new BPF program type, BPF_PROG_TYPE_USER, or "user program". A user program is triggered via sys_bpf(BPF_PROG_TEST_RUN), which is significantly faster than a trap. To make user programs more flexible, we enable the following features: 1. The user can specify on which cpu the program should run. If the target cpu is not the current cpu, the program is triggered via IPI. 2. The user can pass an optional argument to the user program. Currently, the argument can only be five u64 numbers. A user program has access to the helper functions in bpf_tracing_func_proto() as well as bpf_get_stack() and bpf_get_stackid(). 
Signed-off-by: Song Liu <songliubraving@xxxxxx> --- include/linux/bpf_types.h | 2 + include/uapi/linux/bpf.h | 19 ++++++ kernel/bpf/syscall.c | 3 +- kernel/trace/bpf_trace.c | 121 +++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 19 ++++++ 5 files changed, 163 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a52a5688418e5..3c52f3207aced 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -76,6 +76,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_EXT, bpf_extension, BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, void *, void *) #endif /* CONFIG_BPF_LSM */ +BPF_PROG_TYPE(BPF_PROG_TYPE_USER, user, + void *, void *) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eb5e0c38eb2cf..f6b9d4e7eeb4e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -190,6 +190,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_USER, }; enum bpf_attach_type { @@ -556,6 +557,12 @@ union bpf_attr { */ __aligned_u64 ctx_in; __aligned_u64 ctx_out; + __u32 cpu_plus; /* run this program on cpu + * (cpu_plus - 1). + * If cpu_plus == 0, run on + * current cpu. Only valid + * for BPF_PROG_TYPE_USER. 
+ */ } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ @@ -4441,4 +4448,16 @@ struct bpf_sk_lookup { __u32 local_port; /* Host byte order */ }; +struct pt_regs; + +#define BPF_USER_PROG_MAX_ARGS 5 +struct bpf_user_prog_args { + __u64 args[BPF_USER_PROG_MAX_ARGS]; +}; + +struct bpf_user_prog_ctx { + struct pt_regs *regs; + __u64 args[BPF_USER_PROG_MAX_ARGS]; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cd3d599e9e90e..f5a28fd8a9bc2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2078,6 +2078,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ case BPF_PROG_TYPE_EXT: /* extends any prog */ + case BPF_PROG_TYPE_USER: return true; default: return false; @@ -2969,7 +2970,7 @@ static int bpf_prog_query(const union bpf_attr *attr, } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out +#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu_plus static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cb91ef902cc43..cbe789bc1b986 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -16,6 +16,7 @@ #include <linux/error-injection.h> #include <linux/btf_ids.h> +#include <asm/irq_regs.h> #include <asm/tlb.h> #include "trace_probe.h" @@ -1740,6 +1741,126 @@ const struct bpf_verifier_ops perf_event_verifier_ops = { const struct bpf_prog_ops perf_event_prog_ops = { }; +struct bpf_user_prog_test_run_info { + struct bpf_prog *prog; + struct bpf_user_prog_ctx ctx; + u32 retval; +}; + +static void +__bpf_prog_test_run_user(struct bpf_user_prog_test_run_info *info) +{ + rcu_read_lock(); + migrate_disable(); + info->retval = BPF_PROG_RUN(info->prog, &info->ctx); + migrate_enable(); + rcu_read_unlock(); +} + +static void _bpf_prog_test_run_user(void *data) +{ + struct 
bpf_user_prog_test_run_info *info = data; + + info->ctx.regs = get_irq_regs(); + __bpf_prog_test_run_user(info); +} + +static int bpf_prog_test_run_user(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + __u32 data_size = kattr->test.data_size_in; + struct bpf_user_prog_test_run_info info; + int cpu = kattr->test.cpu_plus - 1; + int err; + + if (kattr->test.ctx_in || kattr->test.ctx_out || + kattr->test.duration || kattr->test.repeat || + kattr->test.data_out) + return -EINVAL; + + if ((data_in && !data_size) || (!data_in && data_size)) + return -EINVAL; + + /* if provided, data_in should be struct bpf_user_prog_args */ + if (data_size > 0 && data_size != sizeof(struct bpf_user_prog_args)) + return -EINVAL; + + if (kattr->test.data_size_in) { + if (copy_from_user(&info.ctx.args, data_in, + sizeof(struct bpf_user_prog_args))) + return -EFAULT; + } else { + memset(&info.ctx.args, 0, sizeof(struct bpf_user_prog_args)); + } + + info.prog = prog; + + if (!kattr->test.cpu_plus || cpu == smp_processor_id()) { + /* non-IPI, use regs from perf_fetch_caller_regs */ + info.ctx.regs = get_bpf_raw_tp_regs(); + if (IS_ERR(info.ctx.regs)) + return PTR_ERR(info.ctx.regs); + perf_fetch_caller_regs(info.ctx.regs); + __bpf_prog_test_run_user(&info); + put_bpf_raw_tp_regs(); + } else { + err = smp_call_function_single(cpu, _bpf_prog_test_run_user, + &info, 1); + if (err) + return err; + } + + if (copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) + return -EFAULT; + + return 0; +} + +static bool user_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_u64 = sizeof(u64); + + if (off < 0 || off >= sizeof(struct bpf_user_prog_ctx)) + return false; + + switch (off) { + case bpf_ctx_range(struct bpf_user_prog_ctx, regs): + bpf_ctx_record_field_size(info, size_u64); + if 
(!bpf_ctx_narrow_access_ok(off, size, size_u64)) + return false; + break; + default: + break; + } + return true; +} + +static const struct bpf_func_proto * +user_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto; + default: + return bpf_tracing_func_proto(func_id, prog); + } +} + +const struct bpf_verifier_ops user_verifier_ops = { + .get_func_proto = user_prog_func_proto, + .is_valid_access = user_prog_is_valid_access, +}; + +const struct bpf_prog_ops user_prog_ops = { + .test_run = bpf_prog_test_run_user, +}; + static DEFINE_MUTEX(bpf_event_mutex); #define BPF_TRACE_MAX_PROGS 64 diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index eb5e0c38eb2cf..f6b9d4e7eeb4e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -190,6 +190,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_USER, }; enum bpf_attach_type { @@ -556,6 +557,12 @@ union bpf_attr { */ __aligned_u64 ctx_in; __aligned_u64 ctx_out; + __u32 cpu_plus; /* run this program on cpu + * (cpu_plus - 1). + * If cpu_plus == 0, run on + * current cpu. Only valid + * for BPF_PROG_TYPE_USER. + */ } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ @@ -4441,4 +4448,16 @@ struct bpf_sk_lookup { __u32 local_port; /* Host byte order */ }; +struct pt_regs; + +#define BPF_USER_PROG_MAX_ARGS 5 +struct bpf_user_prog_args { + __u64 args[BPF_USER_PROG_MAX_ARGS]; +}; + +struct bpf_user_prog_ctx { + struct pt_regs *regs; + __u64 args[BPF_USER_PROG_MAX_ARGS]; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- 2.24.1