On Wed, Oct 7, 2015 at 10:23 PM, Alexei Starovoitov <ast@xxxxxxxxxxxx> wrote: > In order to let unprivileged users load and execute eBPF programs > teach verifier to prevent pointer leaks. > Verifier will prevent > - any arithmetic on pointers > (except R10+Imm which is used to compute stack addresses) > - comparison of pointers > (except if (map_value_ptr == 0) ... ) > - passing pointers to helper functions > - indirectly passing pointers in stack to helper functions > - returning pointer from bpf program > - storing pointers into ctx or maps > > Spill/fill of pointers into stack is allowed, but mangling > of pointers stored in the stack or reading them byte by byte is not. > > Within bpf programs the pointers do exist, since programs need to > be able to access maps, pass skb pointer to LD_ABS insns, etc > but programs cannot pass such pointer values to the outside > or obfuscate them. > > Only allow BPF_PROG_TYPE_SOCKET_FILTER unprivileged programs, > so that socket filters (tcpdump), af_packet (quic acceleration) > and future kcm can use it. > tracing and tc cls/act program types still require root permissions, > since tracing actually needs to be able to see all kernel pointers > and tc is for root only. > > For example, the following unprivileged socket filter program is allowed: > int bpf_prog1(struct __sk_buff *skb) > { > u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); > u64 *value = bpf_map_lookup_elem(&my_map, &index); > > if (value) > *value += skb->len; > return 0; > } > > but the following program is not: > int bpf_prog1(struct __sk_buff *skb) > { > u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); > u64 *value = bpf_map_lookup_elem(&my_map, &index); > > if (value) > *value += (u64) skb; > return 0; > } > since it would leak the kernel address into the map. > > Unprivileged socket filter bpf programs have access to the > following helper functions: > - map lookup/update/delete (but they cannot store kernel pointers into them) > - get_random (it's already exposed to unprivileged user space) > - get_smp_processor_id > - tail_call into another socket filter program > - ktime_get_ns > > The feature is controlled by sysctl kernel.unprivileged_bpf_disabled. > This toggle defaults to off (0), but can be set true (1). Once true, > bpf programs and maps cannot be accessed from unprivileged process, > and the toggle cannot be set back to false. > > Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxxxx> Reviewed-by: Kees Cook <keescook@xxxxxxxxxxxx> Thanks for making this safer! :) -Kees > --- > v1->v2: > - sysctl_unprivileged_bpf_disabled > - drop bpf_trace_printk > - split tests into separate patch to ease review > --- > include/linux/bpf.h | 2 + > kernel/bpf/syscall.c | 11 ++--- > kernel/bpf/verifier.c | 106 ++++++++++++++++++++++++++++++++++++++++++++----- > kernel/sysctl.c | 13 ++++++ > net/core/filter.c | 3 +- > 5 files changed, 120 insertions(+), 15 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 19b8a2081f88..e472b06df138 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -167,6 +167,8 @@ void bpf_prog_put_rcu(struct bpf_prog *prog); > struct bpf_map *bpf_map_get(struct fd f); > void bpf_map_put(struct bpf_map *map); > > +extern int sysctl_unprivileged_bpf_disabled; > + > /* verify correctness of eBPF program */ > int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); > #else > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 5f35f420c12f..9f824b0f0f5f 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -18,6 +18,8 @@ > #include <linux/filter.h> > #include <linux/version.h> > > +int sysctl_unprivileged_bpf_disabled __read_mostly; > + > static LIST_HEAD(bpf_map_types); > > static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) > @@ -542,6 +544,9 @@ static int bpf_prog_load(union bpf_attr *attr) > attr->kern_version != LINUX_VERSION_CODE) > return -EINVAL; > > + if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) > + return -EPERM; > + > /* plain bpf_prog allocation */ > prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); > if (!prog) > @@ -597,11 +602,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz > union bpf_attr attr = {}; > int err; > > - /* the syscall is limited to root temporarily. This restriction will be > - * lifted when security audit is clean. Note that eBPF+tracing must have > - * this restriction, since it may pass kernel data to user space > - */ > - if (!capable(CAP_SYS_ADMIN)) > + if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) > return -EPERM; > > if (!access_ok(VERIFY_READ, uattr, 1)) > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index f8da034c2258..1d6b97be79e1 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -199,6 +199,7 @@ struct verifier_env { > struct verifier_state_list **explored_states; /* search pruning optimization */ > struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ > u32 used_map_cnt; /* number of used maps */ > + bool allow_ptr_leaks; > }; > > /* verbose verifier prints what it's seeing > @@ -538,6 +539,21 @@ static int bpf_size_to_bytes(int bpf_size) > return -EINVAL; > } > > +static bool is_spillable_regtype(enum bpf_reg_type type) > +{ > + switch (type) { > + case PTR_TO_MAP_VALUE: > + case PTR_TO_MAP_VALUE_OR_NULL: > + case PTR_TO_STACK: > + case PTR_TO_CTX: > + case FRAME_PTR: > + case CONST_PTR_TO_MAP: > + return true; > + default: > + return false; > + } > +} > + > /* check_stack_read/write functions track spill/fill of registers, > * stack boundary and alignment are checked in check_mem_access() > */ > @@ -550,9 +566,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, > */ > > if (value_regno >= 0 && > - (state->regs[value_regno].type == PTR_TO_MAP_VALUE || > - state->regs[value_regno].type == PTR_TO_STACK || > - state->regs[value_regno].type == PTR_TO_CTX)) { > + is_spillable_regtype(state->regs[value_regno].type)) { > > /* register containing pointer is being spilled into stack */ > if (size != BPF_REG_SIZE) { > @@ -643,6 +657,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, > return -EACCES; > } > > +static bool is_pointer_value(struct verifier_env *env, int regno) > +{ > + if (env->allow_ptr_leaks) > + return false; > + > + switch (env->cur_state.regs[regno].type) { > + case UNKNOWN_VALUE: > + case CONST_IMM: > + return false; > + default: > + return true; > + } > +} > + > /* check whether memory at (regno + off) is accessible for t = (read | write) > * if t==write, value_regno is a register which value is stored into memory > * if t==read, value_regno is a register which will receive the value from memory > @@ -669,11 +697,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, > } > > if (state->regs[regno].type == PTR_TO_MAP_VALUE) { > + if (t == BPF_WRITE && value_regno >= 0 && > + is_pointer_value(env, value_regno)) { > + verbose("R%d leaks addr into map\n", value_regno); > + return -EACCES; > + } > err = check_map_access(env, regno, off, size); > if (!err && t == BPF_READ && value_regno >= 0) > mark_reg_unknown_value(state->regs, value_regno); > > } else if (state->regs[regno].type == PTR_TO_CTX) { > + if (t == BPF_WRITE && value_regno >= 0 && > + is_pointer_value(env, value_regno)) { > + verbose("R%d leaks addr into ctx\n", value_regno); > + return -EACCES; > + } > err = check_ctx_access(env, off, size, t); > if (!err && t == BPF_READ && value_regno >= 0) > mark_reg_unknown_value(state->regs, value_regno); > @@ -684,10 +722,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, > verbose("invalid stack off=%d size=%d\n", off, size); > return -EACCES; > } > - if (t == BPF_WRITE) > + if (t == BPF_WRITE) { > + if (!env->allow_ptr_leaks && > + state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && > + size != BPF_REG_SIZE) { > + verbose("attempt to corrupt spilled pointer on stack\n"); > + return -EACCES; > + } > err = check_stack_write(state, off, size, value_regno); > - else > + } else { > err = check_stack_read(state, off, size, value_regno); > + } > } else { > verbose("R%d invalid mem access '%s'\n", > regno, reg_type_str[state->regs[regno].type]); > @@ -775,8 +820,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, > return -EACCES; > } > > - if (arg_type == ARG_ANYTHING) > + if (arg_type == ARG_ANYTHING) { > + if (is_pointer_value(env, regno)) { > + verbose("R%d leaks addr into helper function\n", regno); > + return -EACCES; > + } > return 0; > + } > > if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || > arg_type == ARG_PTR_TO_MAP_VALUE) { > @@ -950,8 +1000,9 @@ static int check_call(struct verifier_env *env, int func_id) > } > > /* check validity of 32-bit and 64-bit arithmetic operations */ > -static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) > +static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) > { > + struct reg_state *regs = env->cur_state.regs; > u8 opcode = BPF_OP(insn->code); > int err; > > @@ -976,6 +1027,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) > if (err) > return err; > > + if (is_pointer_value(env, insn->dst_reg)) { > + verbose("R%d pointer arithmetic prohibited\n", > + insn->dst_reg); > + return -EACCES; > + } > + > /* check dest operand */ > err = check_reg_arg(regs, insn->dst_reg, DST_OP); > if (err) > @@ -1012,6 +1069,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) > */ > regs[insn->dst_reg] = regs[insn->src_reg]; > } else { > + if (is_pointer_value(env, insn->src_reg)) { > + verbose("R%d partial copy of pointer\n", > + insn->src_reg); > + return -EACCES; > + } > regs[insn->dst_reg].type = UNKNOWN_VALUE; > regs[insn->dst_reg].map_ptr = NULL; > } > @@ -1061,8 +1123,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) > /* pattern match 'bpf_add Rx, imm' instruction */ > if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && > regs[insn->dst_reg].type == FRAME_PTR && > - BPF_SRC(insn->code) == BPF_K) > + BPF_SRC(insn->code) == BPF_K) { > stack_relative = true; > + } else if (is_pointer_value(env, insn->dst_reg)) { > + verbose("R%d pointer arithmetic prohibited\n", > + insn->dst_reg); > + return -EACCES; > + } else if (BPF_SRC(insn->code) == BPF_X && > + is_pointer_value(env, insn->src_reg)) { > + verbose("R%d pointer arithmetic prohibited\n", > + insn->src_reg); > + return -EACCES; > + } > > /* check dest operand */ > err = check_reg_arg(regs, insn->dst_reg, DST_OP); > @@ -1101,6 +1173,12 @@ static int check_cond_jmp_op(struct verifier_env *env, > err = check_reg_arg(regs, insn->src_reg, SRC_OP); > if (err) > return err; > + > + if (is_pointer_value(env, insn->src_reg)) { > + verbose("R%d pointer comparison prohibited\n", > + insn->src_reg); > + return -EACCES; > + } > } else { > if (insn->src_reg != BPF_REG_0) { > verbose("BPF_JMP uses reserved fields\n"); > @@ -1155,6 +1233,9 @@ static int check_cond_jmp_op(struct verifier_env *env, > regs[insn->dst_reg].type = CONST_IMM; > regs[insn->dst_reg].imm = 0; > } > + } else if (is_pointer_value(env, insn->dst_reg)) { > + verbose("R%d pointer comparison prohibited\n", insn->dst_reg); > + return -EACCES; > } else if (BPF_SRC(insn->code) == BPF_K && > (opcode == BPF_JEQ || opcode == BPF_JNE)) { > > @@ -1658,7 +1739,7 @@ static int do_check(struct verifier_env *env) > } > > if (class == BPF_ALU || class == BPF_ALU64) { > - err = check_alu_op(regs, insn); > + err = check_alu_op(env, insn); > if (err) > return err; > > @@ -1816,6 +1897,11 @@ static int do_check(struct verifier_env *env) > if (err) > return err; > > + if (is_pointer_value(env, BPF_REG_0)) { > + verbose("R0 leaks addr as return value\n"); > + return -EACCES; > + } > + > process_bpf_exit: > insn_idx = pop_stack(env, &prev_insn_idx); > if (insn_idx < 0) { > @@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) > if (ret < 0) > goto skip_full_check; > > + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); > + > ret = do_check(env); > > skip_full_check: > diff --git a/kernel/sysctl.c b/kernel/sysctl.c > index e69201d8094e..96c856b04081 100644 > --- a/kernel/sysctl.c > +++ b/kernel/sysctl.c > @@ -64,6 +64,7 @@ > #include <linux/binfmts.h> > #include <linux/sched/sysctl.h> > #include <linux/kexec.h> > +#include <linux/bpf.h> > > #include <asm/uaccess.h> > #include <asm/processor.h> > @@ -1139,6 +1140,18 @@ static struct ctl_table kern_table[] = { > .proc_handler = timer_migration_handler, > }, > #endif > +#ifdef CONFIG_BPF_SYSCALL > + { > + .procname = "unprivileged_bpf_disabled", > + .data = &sysctl_unprivileged_bpf_disabled, > + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), > + .mode = 0644, > + /* only handle a transition from default "0" to "1" */ > + .proc_handler = proc_dointvec_minmax, > + .extra1 = &one, > + .extra2 = &one, > + }, > +#endif > { } > }; > > diff --git a/net/core/filter.c b/net/core/filter.c > index cbaa23c3fb30..8fb3acbbe4cb 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -1644,7 +1644,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) > case BPF_FUNC_ktime_get_ns: > return &bpf_ktime_get_ns_proto; > case BPF_FUNC_trace_printk: > - return bpf_get_trace_printk_proto(); > + if (capable(CAP_SYS_ADMIN)) > + return bpf_get_trace_printk_proto(); > default: > return NULL; > } > -- > 1.7.9.5 > -- Kees Cook Chrome OS Security -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html