On Thu, Jul 17, 2014 at 9:19 PM, Alexei Starovoitov <ast@xxxxxxxxxxxx> wrote: > eBPF programs are safe run-to-completion functions with load/unload > methods from userspace similar to kernel modules. > > User space API: > > - load eBPF program > fd = bpf_prog_load(bpf_prog_type, struct nlattr *prog, int len) > > where 'prog' is a sequence of sections (TEXT, LICENSE, MAP_ASSOC) > TEXT - array of eBPF instructions > LICENSE - must be GPL compatible to call helper functions marked gpl_only > MAP_FIXUP - array of {insn idx, map fd} used by kernel to adjust > imm constants in 'mov' instructions used to access maps Nit: naming mismatch between MAP_ASSOC vs MAP_FIXUP. > > - unload eBPF program > close(fd) > > User space example of syscall(__NR_bpf, BPF_PROG_LOAD, prog_type, ...) > follows in later patches > > Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxxxx> > --- > include/linux/bpf.h | 33 +++++ > include/linux/filter.h | 9 +- > include/uapi/linux/bpf.h | 29 +++++ > kernel/bpf/core.c | 5 +- > kernel/bpf/syscall.c | 309 ++++++++++++++++++++++++++++++++++++++++++++++ > net/core/filter.c | 9 +- > 6 files changed, 388 insertions(+), 6 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 91e2caf8edf9..4967619595cc 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -46,4 +46,37 @@ struct bpf_map_type_list { > void bpf_register_map_type(struct bpf_map_type_list *tl); > struct bpf_map *bpf_map_get(u32 map_id); > > +/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs > + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL > + * instructions after verifying > + */ > +struct bpf_func_proto { > + u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); > + bool gpl_only; > +}; > + > +struct bpf_verifier_ops { > + /* return eBPF function prototype for verification */ > + const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); > +}; > + > +struct bpf_prog_type_list { > + 
struct list_head list_node; > + struct bpf_verifier_ops *ops; > + enum bpf_prog_type type; > +}; > + > +void bpf_register_prog_type(struct bpf_prog_type_list *tl); > + > +struct bpf_prog_info { > + bool is_gpl_compatible; > + enum bpf_prog_type prog_type; > + struct bpf_verifier_ops *ops; > + u32 *used_maps; > + u32 used_map_cnt; > +}; > + > +void free_bpf_prog_info(struct bpf_prog_info *info); > +struct sk_filter *bpf_prog_get(u32 ufd); > + > #endif /* _LINUX_BPF_H */ > diff --git a/include/linux/filter.h b/include/linux/filter.h > index b43ad6a2b3cf..822b310e75e1 100644 > --- a/include/linux/filter.h > +++ b/include/linux/filter.h > @@ -30,12 +30,17 @@ struct sock_fprog_kern { > struct sk_buff; > struct sock; > struct seccomp_data; > +struct bpf_prog_info; > > struct sk_filter { > atomic_t refcnt; > u32 jited:1, /* Is our filter JIT'ed? */ > - len:31; /* Number of filter blocks */ > - struct sock_fprog_kern *orig_prog; /* Original BPF program */ > + ebpf:1, /* Is it eBPF program ? */ > + len:30; /* Number of filter blocks */ > + union { > + struct sock_fprog_kern *orig_prog; /* Original BPF program */ > + struct bpf_prog_info *info; > + }; > struct rcu_head rcu; > unsigned int (*bpf_func)(const struct sk_buff *skb, > const struct bpf_insn *filter); > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 3ea11ba053a8..06ba71b49f64 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -333,6 +333,13 @@ enum bpf_cmd { > * returns zero and stores next key or negative error > */ > BPF_MAP_GET_NEXT_KEY, > + > + /* verify and load eBPF program > + * prog_id = bpf_prog_load(bpf_prog_type, struct nlattr *prog, int len) > + * prog is a sequence of sections > + * returns fd or negative error > + */ > + BPF_PROG_LOAD, > }; > > enum bpf_map_attributes { > @@ -350,4 +357,26 @@ enum bpf_map_type { > BPF_MAP_TYPE_HASH, > }; > > +enum bpf_prog_attributes { > + BPF_PROG_UNSPEC, > + BPF_PROG_TEXT, /* array of eBPF instructions */ > + 
BPF_PROG_LICENSE, /* license string */ > + BPF_PROG_MAP_FIXUP, /* array of {insn idx, map fd} to fixup insns */ > + __BPF_PROG_ATTR_MAX, > +}; > +#define BPF_PROG_ATTR_MAX (__BPF_PROG_ATTR_MAX - 1) > +#define BPF_PROG_MAX_ATTR_SIZE 65535 > + > +enum bpf_prog_type { > + BPF_PROG_TYPE_UNSPEC, > +}; > + > +/* integer value in 'imm' field of BPF_CALL instruction selects which helper > + * function eBPF program intends to call > + */ > +enum bpf_func_id { > + BPF_FUNC_unspec, > + __BPF_FUNC_MAX_ID, > +}; > + > #endif /* _UAPI__LINUX_BPF_H__ */ > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 265a02cc822d..e65ecdc36358 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -23,6 +23,7 @@ > #include <linux/filter.h> > #include <linux/skbuff.h> > #include <asm/unaligned.h> > +#include <linux/bpf.h> > > /* Registers */ > #define BPF_R0 regs[BPF_REG_0] > @@ -528,9 +529,11 @@ void sk_filter_select_runtime(struct sk_filter *fp) > } > EXPORT_SYMBOL_GPL(sk_filter_select_runtime); > > -/* free internal BPF program */ > +/* free internal BPF program, called after RCU grace period */ > void sk_filter_free(struct sk_filter *fp) > { > + if (fp->ebpf) > + free_bpf_prog_info(fp->info); > bpf_jit_free(fp); > } > EXPORT_SYMBOL_GPL(sk_filter_free); > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index ca2be66845b3..9e45ca6b6937 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -14,6 +14,8 @@ > #include <net/netlink.h> > #include <linux/anon_inodes.h> > #include <linux/file.h> > +#include <linux/license.h> > +#include <linux/filter.h> > > /* mutex to protect insertion/deletion of map_id in IDR */ > static DEFINE_MUTEX(bpf_map_lock); > @@ -406,6 +408,310 @@ err_unlock: > return err; > } > > +static LIST_HEAD(bpf_prog_types); > + > +static int find_prog_type(enum bpf_prog_type type, struct sk_filter *prog) > +{ > + struct bpf_prog_type_list *tl; > + > + list_for_each_entry(tl, &bpf_prog_types, list_node) { > + if (tl->type == type) { > + 
prog->info->ops = tl->ops; > + prog->info->prog_type = type; > + return 0; > + } > + } > + return -EINVAL; > +} > + > +void bpf_register_prog_type(struct bpf_prog_type_list *tl) > +{ > + list_add(&tl->list_node, &bpf_prog_types); > +} > + > +/* fixup insn->imm field of bpf_call instructions: > + * if (insn->imm == BPF_FUNC_map_lookup_elem) > + * insn->imm = bpf_map_lookup_elem - __bpf_call_base; > + * else if (insn->imm == BPF_FUNC_map_update_elem) > + * insn->imm = bpf_map_update_elem - __bpf_call_base; > + * else ... > + * > + * this function is called after eBPF program passed verification > + */ > +static void fixup_bpf_calls(struct sk_filter *prog) > +{ > + const struct bpf_func_proto *fn; > + int i; > + > + for (i = 0; i < prog->len; i++) { > + struct bpf_insn *insn = &prog->insnsi[i]; > + > + if (insn->code == (BPF_JMP | BPF_CALL)) { > + /* we reach here when program has bpf_call instructions > + * and it passed bpf_check(), means that > + * ops->get_func_proto must have been supplied, check it > + */ > + BUG_ON(!prog->info->ops->get_func_proto); > + > + fn = prog->info->ops->get_func_proto(insn->imm); > + /* all functions that have prototype and verifier allowed > + * programs to call them, must be real in-kernel functions > + */ > + BUG_ON(!fn->func); > + insn->imm = fn->func - __bpf_call_base; > + } > + } > +} > + > +/* fixup instructions that are using map_ids: > + * > + * BPF_MOV64_IMM(BPF_REG_1, MAP_ID), // r1 = MAP_ID > + * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), > + * > + * in the 1st insn kernel replaces MAP_ID with global map_id, > + * since programs are executing out of different contexts and must use > + * globally visible ids to access maps > + * > + * map_fixup is an array of pairs {insn idx, map ufd} > + * > + * kernel resolves ufd -> global map_id and adjusts eBPF instructions > + */ > +static int fixup_bpf_map_id(struct sk_filter *prog, struct nlattr *map_fixup) > +{ > + struct { > + u32 insn_idx; > + u32 ufd; > 
+ } *fixup = nla_data(map_fixup); > + int fixup_len = nla_len(map_fixup) / sizeof(*fixup); > + struct bpf_insn *insn; > + struct fd f; > + u32 idx; > + int i, map_id; > + > + if (fixup_len <= 0) > + return -EINVAL; > + > + for (i = 0; i < fixup_len; i++) { > + idx = fixup[i].insn_idx; > + if (idx >= prog->len) > + return -EINVAL; > + > + insn = &prog->insnsi[idx]; > + if (insn->code != (BPF_ALU64 | BPF_MOV | BPF_K) && > + insn->code != (BPF_ALU | BPF_MOV | BPF_K)) > + return -EINVAL; > + > + f = fdget(fixup[i].ufd); > + > + map_id = get_map_id(f); > + > + if (map_id < 0) > + return map_id; > + > + insn->imm = map_id; > + fdput(f); It looks like there's potentially a race risk of a map_id changing out from under a running program — between the call to fixup_bpf_map_id() and the bpf_map_get() calls during bpf_prog_load() below... > + } > + return 0; > +} > + > +/* free eBPF program auxilary data, called after rcu grace period, > + * so it's safe to drop refcnt on maps used by this program > + * > + * called from sk_filter_release()->sk_filter_release_rcu()->sk_filter_free() > + */ > +void free_bpf_prog_info(struct bpf_prog_info *info) > +{ > + bool found; > + int i; > + > + for (i = 0; i < info->used_map_cnt; i++) { > + found = bpf_map_put(info->used_maps[i]); > + /* all maps that this program was using should obviously still > + * be there > + */ > + BUG_ON(!found); > + } > + kfree(info); > +} > + > +static int bpf_prog_release(struct inode *inode, struct file *filp) > +{ > + struct sk_filter *prog = filp->private_data; > + > + sk_unattached_filter_destroy(prog); > + return 0; > +} > + > +static const struct file_operations bpf_prog_fops = { > + .release = bpf_prog_release, > +}; > + > +static const struct nla_policy prog_policy[BPF_PROG_ATTR_MAX + 1] = { > + [BPF_PROG_TEXT] = { .type = NLA_BINARY }, > + [BPF_PROG_LICENSE] = { .type = NLA_NUL_STRING }, > + [BPF_PROG_MAP_FIXUP] = { .type = NLA_BINARY }, > +}; > + > +static int bpf_prog_load(enum bpf_prog_type type,
struct nlattr __user *uattr, > + int len) > +{ > + struct nlattr *tb[BPF_PROG_ATTR_MAX + 1]; > + struct sk_filter *prog; > + struct bpf_map *map; > + struct nlattr *attr; > + size_t insn_len; > + int err, i; > + bool is_gpl; > + > + if (len <= 0 || len > BPF_PROG_MAX_ATTR_SIZE) > + return -EINVAL; > + > + attr = kmalloc(len, GFP_USER); > + if (!attr) > + return -ENOMEM; > + > + /* copy eBPF program from user space */ > + err = -EFAULT; > + if (copy_from_user(attr, uattr, len) != 0) > + goto free_attr; > + > + /* perform basic validation */ > + err = nla_parse(tb, BPF_PROG_ATTR_MAX, attr, len, prog_policy); > + if (err < 0) > + goto free_attr; > + > + err = -EINVAL; > + /* look for mandatory license string */ > + if (!tb[BPF_PROG_LICENSE]) > + goto free_attr; > + > + /* eBPF programs must be GPL compatible to use GPL-ed functions */ > + is_gpl = license_is_gpl_compatible(nla_data(tb[BPF_PROG_LICENSE])); > + > + /* look for mandatory array of eBPF instructions */ > + if (!tb[BPF_PROG_TEXT]) > + goto free_attr; > + > + insn_len = nla_len(tb[BPF_PROG_TEXT]); > + if (insn_len % sizeof(struct bpf_insn) != 0 || insn_len <= 0) > + goto free_attr; > + > + /* plain sk_filter allocation */ > + err = -ENOMEM; > + prog = kmalloc(sk_filter_size(insn_len), GFP_USER); > + if (!prog) > + goto free_attr; > + > + prog->len = insn_len / sizeof(struct bpf_insn); > + memcpy(prog->insns, nla_data(tb[BPF_PROG_TEXT]), insn_len); > + prog->orig_prog = NULL; > + prog->jited = 0; > + prog->ebpf = 0; > + atomic_set(&prog->refcnt, 1); > + > + if (tb[BPF_PROG_MAP_FIXUP]) { > + /* if program is using maps, fixup map_ids */ > + err = fixup_bpf_map_id(prog, tb[BPF_PROG_MAP_FIXUP]); > + if (err < 0) > + goto free_prog; > + } > + > + /* allocate eBPF related auxilary data */ > + prog->info = kzalloc(sizeof(struct bpf_prog_info), GFP_USER); > + if (!prog->info) > + goto free_prog; > + prog->ebpf = 1; > + prog->info->is_gpl_compatible = is_gpl; > + > + /* find program type: socket_filter vs 
tracing_filter */ > + err = find_prog_type(type, prog); > + if (err < 0) > + goto free_prog; > + > + /* lock maps to prevent any changes to maps, since eBPF program may > + * use them. In such case bpf_check() will populate prog->used_maps > + */ > + mutex_lock(&bpf_map_lock); > + > + /* run eBPF verifier */ > + /* err = bpf_check(prog); */ > + > + if (err == 0 && prog->info->used_maps) { > + /* program passed verifier and it's using some maps, > + * hold them > + */ > + for (i = 0; i < prog->info->used_map_cnt; i++) { > + map = bpf_map_get(prog->info->used_maps[i]); > + BUG_ON(!map); > + atomic_inc(&map->refcnt); > + } > + } > + mutex_unlock(&bpf_map_lock); As mentioned above, I think fixup_bpf_map_id needs to be done under the map_lock mutex, unless I'm misunderstanding something in the object lifetime. > + > + if (err < 0) > + goto free_prog; > + > + /* fixup BPF_CALL->imm field */ > + fixup_bpf_calls(prog); > + > + /* eBPF program is ready to be JITed */ > + sk_filter_select_runtime(prog); > + > + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); > + > + if (err < 0) > + /* failed to allocate fd */ > + goto free_prog; > + > + /* user supplied eBPF prog attributes are no longer needed */ > + kfree(attr); > + > + return err; > +free_prog: > + sk_filter_free(prog); > +free_attr: > + kfree(attr); > + return err; > +} > + > +static struct sk_filter *get_prog(struct fd f) > +{ > + struct sk_filter *prog; > + > + if (!f.file) > + return ERR_PTR(-EBADF); > + > + if (f.file->f_op != &bpf_prog_fops) { > + fdput(f); > + return ERR_PTR(-EINVAL); > + } > + > + prog = f.file->private_data; > + > + return prog; > +} > + > +/* called from sk_attach_filter_ebpf() or from tracing filter attach > + * pairs with > + * sk_detach_filter()->sk_filter_uncharge()->sk_filter_release() > + * or with > + * sk_unattached_filter_destroy()->sk_filter_release() > + */ > +struct sk_filter *bpf_prog_get(u32 ufd) > +{ > + struct fd f = fdget(ufd); > + struct sk_filter 
*prog; > + > + prog = get_prog(f); > + > + if (IS_ERR(prog)) > + return prog; > + > + atomic_inc(&prog->refcnt); > + fdput(f); > + return prog; > +} > + > SYSCALL_DEFINE5(bpf, int, cmd, unsigned long, arg2, unsigned long, arg3, > unsigned long, arg4, unsigned long, arg5) > { > @@ -428,6 +734,9 @@ SYSCALL_DEFINE5(bpf, int, cmd, unsigned long, arg2, unsigned long, arg3, > case BPF_MAP_GET_NEXT_KEY: > return map_get_next_key((int) arg2, (void __user *) arg3, > (void __user *) arg4); > + case BPF_PROG_LOAD: > + return bpf_prog_load((enum bpf_prog_type) arg2, > + (struct nlattr __user *) arg3, (int) arg4); > default: > return -EINVAL; > } > diff --git a/net/core/filter.c b/net/core/filter.c > index f3b2d5e9fe5f..255dba1bb678 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -835,7 +835,7 @@ static void sk_release_orig_filter(struct sk_filter *fp) > { > struct sock_fprog_kern *fprog = fp->orig_prog; > > - if (fprog) { > + if (!fp->ebpf && fprog) { > kfree(fprog->filter); > kfree(fprog); > } > @@ -867,14 +867,16 @@ static void sk_filter_release(struct sk_filter *fp) > > void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) > { > - atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc); > + if (!fp->ebpf) > + atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc); > sk_filter_release(fp); > } > > void sk_filter_charge(struct sock *sk, struct sk_filter *fp) > { > atomic_inc(&fp->refcnt); > - atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); > + if (!fp->ebpf) > + atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); > } > > static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, > @@ -978,6 +980,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, > > fp->bpf_func = NULL; > fp->jited = 0; > + fp->ebpf = 0; > > err = sk_chk_filter(fp->insns, fp->len); > if (err) { > -- > 1.7.9.5 > -Kees -- Kees Cook Chrome OS Security -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a 
message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html