On 12/13/19 4:47 PM, Martin KaFai Lau wrote: > This patch allows the kernel's struct ops (i.e. func ptr) to be > implemented in BPF. The first use case in this series is the > "struct tcp_congestion_ops" which will be introduced in a > latter patch. > > This patch introduces a new prog type BPF_PROG_TYPE_STRUCT_OPS. > The BPF_PROG_TYPE_STRUCT_OPS prog is verified against a particular > func ptr of a kernel struct. The attr->attach_btf_id is the btf id > of a kernel struct. The attr->expected_attach_type is the member > "index" of that kernel struct. The first member of a struct starts > with member index 0. That will avoid ambiguity when a kernel struct > has multiple func ptrs with the same func signature. > > For example, a BPF_PROG_TYPE_STRUCT_OPS prog is written > to implement the "init" func ptr of the "struct tcp_congestion_ops". > The attr->attach_btf_id is the btf id of the "struct tcp_congestion_ops" > of the _running_ kernel. The attr->expected_attach_type is 3. > > The ctx of BPF_PROG_TYPE_STRUCT_OPS is an array of u64 args saved > by arch_prepare_bpf_trampoline that will be done in the next > patch when introducing BPF_MAP_TYPE_STRUCT_OPS. > > "struct bpf_struct_ops" is introduced as a common interface for the kernel > struct that supports BPF_PROG_TYPE_STRUCT_OPS prog. The supporting kernel > struct will need to implement an instance of the "struct bpf_struct_ops". > > The supporting kernel struct also needs to implement a bpf_verifier_ops. > During BPF_PROG_LOAD, bpf_struct_ops_find() will find the right > bpf_verifier_ops by searching the attr->attach_btf_id. > > A new "btf_struct_access" is also added to the bpf_verifier_ops such > that the supporting kernel struct can optionally provide its own specific > check on accessing the func arg (e.g. provide limited write access). > > After btf_vmlinux is parsed, the new bpf_struct_ops_init() is called > to initialize some values (e.g. the btf id of the supporting kernel > struct) and it can only be done once the btf_vmlinux is available. > > The R0 checks at BPF_EXIT is excluded for the BPF_PROG_TYPE_STRUCT_OPS prog > if the return type of the prog->aux->attach_func_proto is "void". > > Signed-off-by: Martin KaFai Lau <kafai@xxxxxx> > --- > include/linux/bpf.h | 30 +++++++ > include/linux/bpf_types.h | 4 + > include/linux/btf.h | 34 ++++++++ > include/uapi/linux/bpf.h | 1 + > kernel/bpf/Makefile | 2 +- > kernel/bpf/bpf_struct_ops.c | 124 +++++++++++++++++++++++++++ > kernel/bpf/bpf_struct_ops_types.h | 4 + > kernel/bpf/btf.c | 88 ++++++++++++++------ > kernel/bpf/syscall.c | 17 ++-- > kernel/bpf/verifier.c | 134 +++++++++++++++++++++++------- > 10 files changed, 374 insertions(+), 64 deletions(-) > create mode 100644 kernel/bpf/bpf_struct_ops.c > create mode 100644 kernel/bpf/bpf_struct_ops_types.h > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index d467983e61bb..1f0a5fc8c5ee 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -349,6 +349,10 @@ struct bpf_verifier_ops { > const struct bpf_insn *src, > struct bpf_insn *dst, > struct bpf_prog *prog, u32 *target_size); > + int (*btf_struct_access)(struct bpf_verifier_log *log, > + const struct btf_type *t, int off, int size, > + enum bpf_access_type atype, > + u32 *next_btf_id); > }; > > struct bpf_prog_offload_ops { > @@ -667,6 +671,32 @@ struct bpf_array_aux { > struct work_struct work; > }; > > +struct btf_type; > +struct btf_member; > + > +#define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 > +struct bpf_struct_ops { > + const struct bpf_verifier_ops *verifier_ops; > + int (*init)(struct btf *_btf_vmlinux); > + int (*check_member)(const struct btf_type *t, > + const struct btf_member *member); > + const struct btf_type *type; > + const char *name; > + struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; > + u32 type_id; > +}; > + > +#if defined(CONFIG_BPF_JIT) > +const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id); > +void bpf_struct_ops_init(struct btf *_btf_vmlinux); > +#else > +static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) > +{ > + return NULL; > +} > +static inline void bpf_struct_ops_init(struct btf *_btf_vmlinux) { } > +#endif > + > struct bpf_array { > struct bpf_map map; > u32 elem_size; [...] > +const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { > +}; > + > +const struct bpf_prog_ops bpf_struct_ops_prog_ops = { > +}; > + > +void bpf_struct_ops_init(struct btf *_btf_vmlinux) > +{ > + const struct btf_member *member; > + struct bpf_struct_ops *st_ops; > + struct bpf_verifier_log log = {}; > + const struct btf_type *t; > + const char *mname; > + s32 type_id; > + u32 i, j; > + > + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { > + st_ops = bpf_struct_ops[i]; > + > + type_id = btf_find_by_name_kind(_btf_vmlinux, st_ops->name, > + BTF_KIND_STRUCT); > + if (type_id < 0) { > + pr_warn("Cannot find struct %s in btf_vmlinux\n", > + st_ops->name); > + continue; > + } > + t = btf_type_by_id(_btf_vmlinux, type_id); > + if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { > + pr_warn("Cannot support #%u members in struct %s\n", > + btf_type_vlen(t), st_ops->name); > + continue; > + } > + > + for_each_member(j, t, member) { > + const struct btf_type *func_proto; > + > + mname = btf_name_by_offset(_btf_vmlinux, > + member->name_off); > + if (!*mname) { > + pr_warn("anon member in struct %s is not supported\n", > + st_ops->name); > + break; > + } > + > + if (btf_member_bitfield_size(t, member)) { > + pr_warn("bit field member %s in struct %s is not supported\n", > + mname, st_ops->name); > + break; > + } > + > + func_proto = btf_type_resolve_func_ptr(_btf_vmlinux, > + member->type, > + NULL); > + if (func_proto && > + btf_distill_func_proto(&log, _btf_vmlinux, > + func_proto, mname, > + &st_ops->func_models[j])) { > + pr_warn("Error in parsing func ptr %s in struct %s\n", > + mname, st_ops->name); > + break; > + } > + } > + > + if (j == btf_type_vlen(t)) { > + if (st_ops->init(_btf_vmlinux)) { is it possible that st_ops->init might be a NULL pointer? > + pr_warn("Error in init bpf_struct_ops %s\n", > + st_ops->name); > + } else { > + st_ops->type_id = type_id; > + st_ops->type = t; > + } > + } > + } > +} > + > +extern struct btf *btf_vmlinux; > + [...] > index 408264c1d55b..4c1eaa1a2965 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -2858,11 +2858,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, > u32 btf_id; > int ret; > > - if (atype != BPF_READ) { > - verbose(env, "only read is supported\n"); > - return -EACCES; > - } > - > if (off < 0) { > verbose(env, > "R%d is ptr_%s invalid negative access: off=%d\n", > @@ -2879,17 +2874,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, > return -EACCES; > } > > - ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); > + if (env->ops->btf_struct_access) { > + ret = env->ops->btf_struct_access(&env->log, t, off, size, > + atype, &btf_id); > + } else { > + if (atype != BPF_READ) { > + verbose(env, "only read is supported\n"); > + return -EACCES; > + } > + > + ret = btf_struct_access(&env->log, t, off, size, atype, > + &btf_id); > + } > + > if (ret < 0) > return ret; > > - if (ret == SCALAR_VALUE) { > - mark_reg_unknown(env, regs, value_regno); > - return 0; > + if (atype == BPF_READ) { > + if (ret == SCALAR_VALUE) { > + mark_reg_unknown(env, regs, value_regno); > + return 0; > + } > + mark_reg_known_zero(env, regs, value_regno); > + regs[value_regno].type = PTR_TO_BTF_ID; > + regs[value_regno].btf_id = btf_id; > } > - mark_reg_known_zero(env, regs, value_regno); > - regs[value_regno].type = PTR_TO_BTF_ID; > - regs[value_regno].btf_id = btf_id; > + > return 0; > } > > @@ -6343,8 +6353,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) > static int check_return_code(struct bpf_verifier_env *env) > { > struct tnum enforce_attach_type_range = tnum_unknown; > + const struct bpf_prog *prog = env->prog; > struct bpf_reg_state *reg; > struct tnum range = tnum_range(0, 1); > + int err; > + > + /* The struct_ops func-ptr's return type could be "void" */ > + if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS && > + !prog->aux->attach_func_proto->type) > + return 0; > + > + /* eBPF calling convetion is such that R0 is used > + * to return the value from eBPF program. > + * Make sure that it's readable at this time > + * of bpf_exit, which means that program wrote > + * something into it earlier > + */ > + err = check_reg_arg(env, BPF_REG_0, SRC_OP); > + if (err) > + return err; > + > + if (is_pointer_value(env, BPF_REG_0)) { > + verbose(env, "R0 leaks addr as return value\n"); > + return -EACCES; > + } > > switch (env->prog->type) { > case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: > @@ -8010,21 +8042,6 @@ static int do_check(struct bpf_verifier_env *env) > if (err) > return err; > > - /* eBPF calling convetion is such that R0 is used > - * to return the value from eBPF program. > - * Make sure that it's readable at this time > - * of bpf_exit, which means that program wrote > - * something into it earlier > - */ > - err = check_reg_arg(env, BPF_REG_0, SRC_OP); > - if (err) > - return err; > - > - if (is_pointer_value(env, BPF_REG_0)) { > - verbose(env, "R0 leaks addr as return value\n"); > - return -EACCES; > - } > - > err = check_return_code(env); > if (err) > return err; > @@ -8833,12 +8850,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) > convert_ctx_access = bpf_xdp_sock_convert_ctx_access; > break; > case PTR_TO_BTF_ID: > - if (type == BPF_WRITE) { > + if (type == BPF_READ) { > + insn->code = BPF_LDX | BPF_PROBE_MEM | > + BPF_SIZE((insn)->code); > + env->prog->aux->num_exentries++; > + } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) { > verbose(env, "Writes through BTF pointers are not allowed\n"); > return -EINVAL; > } > - insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); > - env->prog->aux->num_exentries++; Do we need to increase num_exentries for BPF_WRITE as well? > continue; > default: > continue; > @@ -9505,6 +9524,58 @@ static void print_verification_stats(struct bpf_verifier_env *env) > env->peak_states, env->longest_mark_read_walk); > } > [...]