On Thu, Feb 28, 2019 at 3:31 PM Daniel Borkmann <daniel@xxxxxxxxxxxxx> wrote: > > This generic extension to BPF maps allows for directly loading an > address residing inside a BPF map value as a single BPF ldimm64 > instruction. > > The idea is similar to what BPF_PSEUDO_MAP_FD does today, which > is a special src_reg flag for ldimm64 instruction that indicates > that inside the first part of the double insns's imm field is a > file descriptor which the verifier then replaces as a full 64bit > address of the map into both imm parts. > > For the newly added BPF_PSEUDO_MAP_VALUE src_reg flag, the idea > is similar: the first part of the double insns's imm field is > again a file descriptor corresponding to the map, and the second > part of the imm field is an offset. The verifier will then replace > both imm parts with an address that points into the BPF map value > for maps that support this operation. BPF_PSEUDO_MAP_VALUE is a > distinct flag as otherwise with BPF_PSEUDO_MAP_FD we could not > differ offset 0 between load of map pointer versus load of map's > value at offset 0. > > This allows for efficiently retrieving an address to a map value > memory area without having to issue a helper call which needs to > prepare registers according to calling convention, etc, without > needing the extra NULL test, and without having to add the offset > in an additional instruction to the value base pointer. > > The verifier then treats the destination register as PTR_TO_MAP_VALUE > with constant reg->off from the user passed offset from the second > imm field, and guarantees that this is within bounds of the map > value. Any subsequent operations are normally treated as typical > map value handling without anything else needed for verification. > > The two map operations for direct value access have been added to > array map for now. In future other types could be supported as > well depending on the use case. 
The main use case for this commit > is to allow for BPF loader support for global variables that > reside in .data/.rodata/.bss sections such that we can directly > load the address of them with minimal additional infrastructure > required. Loader support has been added in subsequent commits for > libbpf library. > > Signed-off-by: Daniel Borkmann <daniel@xxxxxxxxxxxxx> > --- > include/linux/bpf.h | 6 +++ > include/linux/bpf_verifier.h | 4 ++ > include/uapi/linux/bpf.h | 6 ++- > kernel/bpf/arraymap.c | 33 ++++++++++++++ > kernel/bpf/core.c | 3 +- > kernel/bpf/disasm.c | 5 ++- > kernel/bpf/syscall.c | 29 +++++++++--- > kernel/bpf/verifier.c | 73 +++++++++++++++++++++++-------- > tools/bpf/bpftool/xlated_dumper.c | 3 ++ > tools/include/uapi/linux/bpf.h | 6 ++- > 10 files changed, 138 insertions(+), 30 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index a2132e09dc1c..bdcc6e2a9977 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -57,6 +57,12 @@ struct bpf_map_ops { > const struct btf *btf, > const struct btf_type *key_type, > const struct btf_type *value_type); > + > + /* Direct value access helpers. 
*/ > + int (*map_direct_value_access)(const struct bpf_map *map, > + u32 off, u64 *imm); > + int (*map_direct_value_offset)(const struct bpf_map *map, > + u64 imm, u32 *off); > }; > > struct bpf_map { > diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h > index 69f7a3449eda..6e28f1c24710 100644 > --- a/include/linux/bpf_verifier.h > +++ b/include/linux/bpf_verifier.h > @@ -183,6 +183,10 @@ struct bpf_insn_aux_data { > unsigned long map_state; /* pointer/poison value for maps */ > s32 call_imm; /* saved imm field of call insn */ > u32 alu_limit; /* limit for add/sub register with pointer */ > + struct { > + u32 map_index; /* index into used_maps[] */ > + u32 map_off; /* offset from value base address */ > + }; > }; > int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ > int sanitize_stack_off; /* stack slot to be cleared */ > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 2e308e90ffea..8884072e1a46 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -255,8 +255,12 @@ enum bpf_attach_type { > */ > #define BPF_F_ANY_ALIGNMENT (1U << 1) > > -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ > +/* When bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_{FD,VALUE}, then > + * bpf_ldimm64's insn[0]->imm == fd in both cases. Additionally, > + * for BPF_PSEUDO_MAP_VALUE, insn[1]->imm == offset into value. 
> + */ > #define BPF_PSEUDO_MAP_FD 1 > +#define BPF_PSEUDO_MAP_VALUE 2 > > /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative > * offset to another bpf function > diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c > index c72e0d8e1e65..3e5969c0c979 100644 > --- a/kernel/bpf/arraymap.c > +++ b/kernel/bpf/arraymap.c > @@ -160,6 +160,37 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) > return array->value + array->elem_size * (index & array->index_mask); > } > > +static int array_map_direct_value_access(const struct bpf_map *map, u32 off, > + u64 *imm) > +{ > + struct bpf_array *array = container_of(map, struct bpf_array, map); > + > + if (map->max_entries != 1) > + return -ENOTSUPP; > + if (off >= map->value_size) > + return -EINVAL; > + > + *imm = (unsigned long)array->value; > + return 0; > +} > + > +static int array_map_direct_value_offset(const struct bpf_map *map, u64 imm, > + u32 *off) > +{ > + struct bpf_array *array = container_of(map, struct bpf_array, map); > + unsigned long range = map->value_size; > + unsigned long base = array->value; > + unsigned long addr = imm; > + > + if (map->max_entries != 1) > + return -ENOENT; > + if (addr < base || addr >= base + range) > + return -ENOENT; > + > + *off = addr - base; > + return 0; > +} > + > /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ > static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) > { > @@ -419,6 +450,8 @@ const struct bpf_map_ops array_map_ops = { > .map_update_elem = array_map_update_elem, > .map_delete_elem = array_map_delete_elem, > .map_gen_lookup = array_map_gen_lookup, > + .map_direct_value_access = array_map_direct_value_access, > + .map_direct_value_offset = array_map_direct_value_offset, > .map_seq_show_elem = array_map_seq_show_elem, > .map_check_btf = array_map_check_btf, > }; > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 1c14c347f3cf..49fc0ff14537 100644 > --- 
a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -286,7 +286,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) > dst[i] = fp->insnsi[i]; > if (!was_ld_map && > dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && > - dst[i].src_reg == BPF_PSEUDO_MAP_FD) { > + (dst[i].src_reg == BPF_PSEUDO_MAP_FD || > + dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { > was_ld_map = true; > dst[i].imm = 0; > } else if (was_ld_map && > diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c > index de73f55e42fd..d9ce383c0f9c 100644 > --- a/kernel/bpf/disasm.c > +++ b/kernel/bpf/disasm.c > @@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, > * part of the ldimm64 insn is accessible. > */ > u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; > - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; > + bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || > + insn->src_reg == BPF_PSEUDO_MAP_VALUE; > char tmp[64]; > > - if (map_ptr && !allow_ptr_leaks) > + if (is_ptr && !allow_ptr_leaks) > imm = 0; > > verbose(cbs->private_data, "(%02x) r%d = %s\n", > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 174581dfe225..d3ef45e01d7a 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -2061,13 +2061,27 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) > } > > static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, > - unsigned long addr) > + unsigned long addr, u32 *off, > + u32 *type) > { > + const struct bpf_map *map; > int i; > > - for (i = 0; i < prog->aux->used_map_cnt; i++) > - if (prog->aux->used_maps[i] == (void *)addr) > - return prog->aux->used_maps[i]; > + *off = *type = 0; > + for (i = 0; i < prog->aux->used_map_cnt; i++) { > + map = prog->aux->used_maps[i]; > + if (map == (void *)addr) { > + *type = BPF_PSEUDO_MAP_FD; > + return map; > + } > + if (!map->ops->map_direct_value_offset) > + continue; > + if (!map->ops->map_direct_value_offset(map, addr, off)) { > + *type = BPF_PSEUDO_MAP_VALUE; > + return map; > 
+ } > + } > + > return NULL; > } > > @@ -2075,6 +2089,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) > { > const struct bpf_map *map; > struct bpf_insn *insns; > + u32 off, type; > u64 imm; > int i; > > @@ -2102,11 +2117,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) > continue; > > imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; > - map = bpf_map_from_imm(prog, imm); > + map = bpf_map_from_imm(prog, imm, &off, &type); > if (map) { > - insns[i].src_reg = BPF_PSEUDO_MAP_FD; > + insns[i].src_reg = type; > insns[i].imm = map->id; > - insns[i + 1].imm = 0; > + insns[i + 1].imm = off; > continue; > } > } > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index 0e4edd7e3c5f..3ad05dda6e9d 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -4944,18 +4944,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, > return 0; > } > > -/* return the map pointer stored inside BPF_LD_IMM64 instruction */ > -static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) > -{ > - u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; > - > - return (struct bpf_map *) (unsigned long) imm64; > -} > - > /* verify BPF_LD_IMM64 instruction */ > static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) > { > + struct bpf_insn_aux_data *aux = cur_aux(env); > struct bpf_reg_state *regs = cur_regs(env); > + struct bpf_map *map; > int err; > > if (BPF_SIZE(insn->code) != BPF_DW) { > @@ -4979,11 +4973,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) > return 0; > } > > - /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ > - BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); > + map = env->used_maps[aux->map_index]; > + mark_reg_known_zero(env, regs, insn->dst_reg); > + regs[insn->dst_reg].map_ptr = map; > + > + if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { > + regs[insn->dst_reg].type = 
PTR_TO_MAP_VALUE; > + regs[insn->dst_reg].off = aux->map_off; > + if (map_value_has_spin_lock(map)) > + regs[insn->dst_reg].id = ++env->id_gen; > + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { > + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; > + } else { > + verbose(env, "bpf verifier is misconfigured\n"); > + return -EINVAL; > + } > > - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; > - regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); > return 0; > } > > @@ -6664,8 +6669,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) > } > > if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { > + struct bpf_insn_aux_data *aux; > struct bpf_map *map; > struct fd f; > + u64 addr; > > if (i == insn_cnt - 1 || insn[1].code != 0 || > insn[1].dst_reg != 0 || insn[1].src_reg != 0 || Next line after this one rejects ldimm64 instructions with off != 0. This check needs to be changed, depending on whether src_reg == BPF_PSEUDO_MAP_VALUE, right? This also relates to the previously discussed question of not enforcing offset (imm=0 in 2nd part of insn) for BPF_PSEUDO_MAP_FD. Seems like the verifier *does* enforce that (not that I'm advocating for re-using BPF_PSEUDO_MAP_FD, just stumbled on this bit when going through verifier code). 
> @@ -6677,8 +6684,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) > if (insn->src_reg == 0) > /* valid generic load 64-bit imm */ > goto next_insn; > - > - if (insn->src_reg != BPF_PSEUDO_MAP_FD) { > + if (insn->src_reg != BPF_PSEUDO_MAP_FD && > + insn->src_reg != BPF_PSEUDO_MAP_VALUE) { > verbose(env, > "unrecognized bpf_ld_imm64 insn\n"); > return -EINVAL; > @@ -6698,16 +6705,44 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) > return err; > } > > - /* store map pointer inside BPF_LD_IMM64 instruction */ > - insn[0].imm = (u32) (unsigned long) map; > - insn[1].imm = ((u64) (unsigned long) map) >> 32; > + aux = &env->insn_aux_data[i]; > + if (insn->src_reg == BPF_PSEUDO_MAP_FD) { > + addr = (unsigned long)map; > + } else { > + u32 off = insn[1].imm; > + > + if (off >= BPF_MAX_VAR_OFF) { > + verbose(env, "direct value offset of %u is not allowed\n", > + off); > + return -EINVAL; > + } > + if (!map->ops->map_direct_value_access) { > + verbose(env, "no direct value access support for this map type\n"); > + return -EINVAL; > + } > + > + err = map->ops->map_direct_value_access(map, off, &addr); > + if (err) { > + verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", > + map->value_size, off); > + return err; > + } > + > + aux->map_off = off; > + addr += off; > + } > + > + insn[0].imm = (u32)addr; > + insn[1].imm = addr >> 32; > > /* check whether we recorded this map already */ > - for (j = 0; j < env->used_map_cnt; j++) > + for (j = 0; j < env->used_map_cnt; j++) { > if (env->used_maps[j] == map) { > + aux->map_index = j; > fdput(f); > goto next_insn; > } > + } > > if (env->used_map_cnt >= MAX_USED_MAPS) { > fdput(f); > @@ -6724,6 +6759,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) > fdput(f); > return PTR_ERR(map); > } > + > + aux->map_index = env->used_map_cnt; > env->used_maps[env->used_map_cnt++] = map; > > if (bpf_map_is_cgroup_storage(map) && > diff --git 
a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c > index 7073dbe1ff27..0bb17bf88b18 100644 > --- a/tools/bpf/bpftool/xlated_dumper.c > +++ b/tools/bpf/bpftool/xlated_dumper.c > @@ -195,6 +195,9 @@ static const char *print_imm(void *private_data, > if (insn->src_reg == BPF_PSEUDO_MAP_FD) > snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), > "map[id:%u]", insn->imm); > + else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) > + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), > + "map[id:%u][0]+%u", insn->imm, (insn + 1)->imm); > else > snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), > "0x%llx", (unsigned long long)full_imm); > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index 2e308e90ffea..8884072e1a46 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -255,8 +255,12 @@ enum bpf_attach_type { > */ > #define BPF_F_ANY_ALIGNMENT (1U << 1) > > -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ > +/* When bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_{FD,VALUE}, then > + * bpf_ldimm64's insn[0]->imm == fd in both cases. Additionally, > + * for BPF_PSEUDO_MAP_VALUE, insn[1]->imm == offset into value. > + */ > #define BPF_PSEUDO_MAP_FD 1 > +#define BPF_PSEUDO_MAP_VALUE 2 > > /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative > * offset to another bpf function > -- > 2.17.1 >