Re: [PATCH v2 bpf-next 14/16] libbpf: Generate loader program out of BPF ELF file.

Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> · Mon, 26 Apr 2021 15:22:36 -0700

On Thu, Apr 22, 2021 at 5:27 PM Alexei Starovoitov
<alexei.starovoitov@xxxxxxxxx> wrote:
>
> From: Alexei Starovoitov <ast@xxxxxxxxxx>
>
> The BPF program loading process performed by libbpf is quite complex
> and consists of the following steps:
> "open" phase:
> - parse elf file and remember relocations, sections
> - collect externs and ksyms including their btf_ids in prog's BTF
> - patch BTF datasec (since llvm couldn't do it)
> - init maps (old style map_def, BTF based, global data map, kconfig map)
> - collect relocations against progs and maps
> "load" phase:
> - probe kernel features
> - load vmlinux BTF
> - resolve externs (kconfig and ksym)
> - load program BTF
> - init struct_ops
> - create maps
> - apply CO-RE relocations
> - patch ld_imm64 insns with src_reg=PSEUDO_MAP, PSEUDO_MAP_VALUE, PSEUDO_BTF_ID
> - reposition subprograms and adjust call insns
> - sanitize and load progs
>
> During this process libbpf does sys_bpf() calls to load BTF, create maps,
> populate maps and finally load programs.
> Instead of actually doing the syscalls generate a trace of what libbpf
> would have done and represent it as the "loader program".
> The "loader program" consists of single map with:
> - union bpf_attr(s)
> - BTF bytes
> - map value bytes
> - insns bytes
> and single bpf program that passes bpf_attr(s) and data into bpf_sys_bpf() helper.
> Executing such "loader program" via bpf_prog_test_run() command will
> replay the sequence of syscalls that libbpf would have done which will result
> the same maps created and programs loaded as specified in the elf file.
> The "loader program" removes libelf and majority of libbpf dependency from
> program loading process.
>
> kconfig, typeless ksym, struct_ops and CO-RE are not supported yet.
>
> The order of relocate_data and relocate_calls had to change, so that
> bpf_gen__prog_load() can see all relocations for a given program with
> correct insn_idx-es.
>
> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
> ---
>  tools/lib/bpf/Build              |   2 +-
>  tools/lib/bpf/bpf_gen_internal.h |  40 ++
>  tools/lib/bpf/gen_loader.c       | 615 +++++++++++++++++++++++++++++++
>  tools/lib/bpf/libbpf.c           | 204 ++++++++--
>  tools/lib/bpf/libbpf.h           |  12 +
>  tools/lib/bpf/libbpf.map         |   1 +
>  tools/lib/bpf/libbpf_internal.h  |   2 +
>  tools/lib/bpf/skel_internal.h    | 105 ++++++
>  8 files changed, 948 insertions(+), 33 deletions(-)
>  create mode 100644 tools/lib/bpf/bpf_gen_internal.h
>  create mode 100644 tools/lib/bpf/gen_loader.c
>  create mode 100644 tools/lib/bpf/skel_internal.h
>
> diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
> index 9b057cc7650a..430f6874fa41 100644
> --- a/tools/lib/bpf/Build
> +++ b/tools/lib/bpf/Build
> @@ -1,3 +1,3 @@
>  libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \
>             netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \
> -           btf_dump.o ringbuf.o strset.o linker.o
> +           btf_dump.o ringbuf.o strset.o linker.o gen_loader.o
> diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
> new file mode 100644
> index 000000000000..dc3e2cbf9ce3
> --- /dev/null
> +++ b/tools/lib/bpf/bpf_gen_internal.h
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
> +/* Copyright (c) 2021 Facebook */
> +#ifndef __BPF_GEN_INTERNAL_H
> +#define __BPF_GEN_INTERNAL_H
> +
> +struct relo_desc {

there is very similarly named reloc_desc struct in libbpf.c, can you
rename it to something like gen_btf_relo_desc?

> +       const char *name;
> +       int kind;
> +       int insn_idx;
> +};
> +

[...]

> +
> +static int bpf_gen__realloc_insn_buf(struct bpf_gen *gen, __u32 size)
> +{
> +       size_t off = gen->insn_cur - gen->insn_start;
> +
> +       if (gen->error)
> +               return gen->error;
> +       if (size > INT32_MAX || off + size > INT32_MAX) {
> +               gen->error = -ERANGE;
> +               return -ERANGE;
> +       }
> +       gen->insn_start = realloc(gen->insn_start, off + size);

leaking memory here: gen->insn_start will be NULL on failure

> +       if (!gen->insn_start) {
> +               gen->error = -ENOMEM;
> +               return -ENOMEM;
> +       }
> +       gen->insn_cur = gen->insn_start + off;
> +       return 0;
> +}
> +
> +static int bpf_gen__realloc_data_buf(struct bpf_gen *gen, __u32 size)
> +{
> +       size_t off = gen->data_cur - gen->data_start;
> +
> +       if (gen->error)
> +               return gen->error;
> +       if (size > INT32_MAX || off + size > INT32_MAX) {
> +               gen->error = -ERANGE;
> +               return -ERANGE;
> +       }
> +       gen->data_start = realloc(gen->data_start, off + size);

same as above

> +       if (!gen->data_start) {
> +               gen->error = -ENOMEM;
> +               return -ENOMEM;
> +       }
> +       gen->data_cur = gen->data_start + off;
> +       return 0;
> +}
> +

[...]

> +
> +static void bpf_gen__emit_sys_bpf(struct bpf_gen *gen, int cmd, int attr, int attr_size)
> +{
> +       bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_1, cmd));
> +       bpf_gen__emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_2, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, attr));

is attr an offset into a blob? if yes, attr_off? or attr_base_off,
anything with _off

> +       bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_3, attr_size));
> +       bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_bpf));
> +       /* remember the result in R7 */
> +       bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0));
> +}
> +
> +static void bpf_gen__emit_check_err(struct bpf_gen *gen)
> +{
> +       bpf_gen__emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 2));
> +       bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7));
> +       bpf_gen__emit(gen, BPF_EXIT_INSN());
> +}
> +
> +static void __bpf_gen__debug(struct bpf_gen *gen, int reg1, int reg2, const char *fmt, va_list args)

Can you please leave a comment on what reg1 and reg2 is, it's not very
clear and the code clearly assumes that it can't be reg[1-4]. It's
probably those special R7 and R9 (or -1, of course), but having a
short comment makes sense to not jump around trying to figure out
possible inputs.

Oh, reading further, it can also be R0.

> +{
> +       char buf[1024];
> +       int addr, len, ret;
> +
> +       if (!gen->log_level)
> +               return;
> +       ret = vsnprintf(buf, sizeof(buf), fmt, args);
> +       if (ret < 1024 - 7 && reg1 >= 0 && reg2 < 0)
> +               /* The special case to accommodate common bpf_gen__debug_ret():
> +                * to avoid specifying BPF_REG_7 and adding " r=%%d" to prints explicitly.
> +                */
> +               strcat(buf, " r=%d");
> +       len = strlen(buf) + 1;
> +       addr = bpf_gen__add_data(gen, buf, len);

nit: offset, not address, right?

> +
> +       bpf_gen__emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, addr));
> +       bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_2, len));
> +       if (reg1 >= 0)
> +               bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_3, reg1));
> +       if (reg2 >= 0)
> +               bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_4, reg2));
> +       bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_trace_printk));
> +}
> +

[...]

> +int bpf_gen__finish(struct bpf_gen *gen)
> +{
> +       int i;
> +
> +       bpf_gen__emit_sys_close_stack(gen, stack_off(btf_fd));
> +       for (i = 0; i < gen->nr_progs; i++)
> +               bpf_gen__move_stack2ctx(gen,
> +                                       sizeof(struct bpf_loader_ctx) +
> +                                       sizeof(struct bpf_map_desc) * gen->nr_maps +
> +                                       sizeof(struct bpf_prog_desc) * i +
> +                                       offsetof(struct bpf_prog_desc, prog_fd), 4,
> +                                       stack_off(prog_fd[i]));
> +       for (i = 0; i < gen->nr_maps; i++)
> +               bpf_gen__move_stack2ctx(gen,
> +                                       sizeof(struct bpf_loader_ctx) +
> +                                       sizeof(struct bpf_map_desc) * i +
> +                                       offsetof(struct bpf_map_desc, map_fd), 4,
> +                                       stack_off(map_fd[i]));
> +       bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0));
> +       bpf_gen__emit(gen, BPF_EXIT_INSN());
> +       pr_debug("bpf_gen__finish %d\n", gen->error);

maybe prefix all those pr_debug()s with "gen: " to distinguish them
from the rest of libbpf logging?

> +       if (!gen->error) {
> +               struct gen_loader_opts *opts = gen->opts;
> +
> +               opts->insns = gen->insn_start;
> +               opts->insns_sz = gen->insn_cur - gen->insn_start;
> +               opts->data = gen->data_start;
> +               opts->data_sz = gen->data_cur - gen->data_start;
> +       }
> +       return gen->error;
> +}
> +
> +void bpf_gen__free(struct bpf_gen *gen)
> +{
> +       if (!gen)
> +               return;
> +       free(gen->data_start);
> +       free(gen->insn_start);
> +       gen->data_start = NULL;
> +       gen->insn_start = NULL;

what's the point of NULL'ing them out if you don't clear gen->data_cur
and gen->insn_cur?

also should it free(gen) itself?

> +}
> +
> +void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, __u32 btf_raw_size)
> +{
> +       union bpf_attr attr = {};

here and below: memset(0)?

> +       int attr_size = offsetofend(union bpf_attr, btf_log_level);
> +       int btf_data, btf_load_attr;
> +
> +       pr_debug("btf_load: size %d\n", btf_raw_size);
> +       btf_data = bpf_gen__add_data(gen, btf_raw_data, btf_raw_size);
> +

[...]

> +       map_create_attr = bpf_gen__add_data(gen, &attr, attr_size);
> +       if (attr.btf_value_type_id)
> +               /* populate union bpf_attr with btf_fd saved in the stack earlier */
> +               bpf_gen__move_stack2blob(gen, map_create_attr + offsetof(union bpf_attr, btf_fd), 4,
> +                                        stack_off(btf_fd));
> +       switch (attr.map_type) {
> +       case BPF_MAP_TYPE_ARRAY_OF_MAPS:
> +       case BPF_MAP_TYPE_HASH_OF_MAPS:
> +               bpf_gen__move_stack2blob(gen, map_create_attr + offsetof(union bpf_attr, inner_map_fd),
> +                                        4, stack_off(inner_map_fd));
> +               close_inner_map_fd = true;
> +               break;
> +       default:;

default:
    break;

> +       }
> +       /* emit MAP_CREATE command */
> +       bpf_gen__emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size);
> +       bpf_gen__debug_ret(gen, "map_create %s idx %d type %d value_size %d",
> +                          attr.map_name, map_idx, map_attr->map_type, attr.value_size);
> +       bpf_gen__emit_check_err(gen);

what will happen on error with inner_map_fd and all the other fds
created by now?

> +       /* remember map_fd in the stack, if successful */
> +       if (map_idx < 0) {
> +               /* This bpf_gen__map_create() function is called with map_idx >= 0 for all maps
> +                * that libbpf loading logic tracks.
> +                * It's called with -1 to create an inner map.
> +                */
> +               bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(inner_map_fd)));
> +       } else {
> +               if (map_idx != gen->nr_maps) {

why would that happen? defensive programming? and even then `if () {}
else if () {} else {}` structure is more appropriate

> +                       gen->error = -EDOM; /* internal bug */
> +                       return;
> +               }
> +               bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(map_fd[map_idx])));
> +               gen->nr_maps++;
> +       }
> +       if (close_inner_map_fd)
> +               bpf_gen__emit_sys_close_stack(gen, stack_off(inner_map_fd));
> +}
> +

[...]

> +static void bpf_gen__cleanup_relos(struct bpf_gen *gen, int insns)
> +{
> +       int i, insn;
> +
> +       for (i = 0; i < gen->relo_cnt; i++) {
> +               if (gen->relos[i].kind != BTF_KIND_VAR)
> +                       continue;
> +               /* close fd recorded in insn[insn_idx + 1].imm */
> +               insn = insns + sizeof(struct bpf_insn) * (gen->relos[i].insn_idx + 1)
> +                       + offsetof(struct bpf_insn, imm);
> +               bpf_gen__emit_sys_close_blob(gen, insn);

wouldn't this close the same FD used across multiple "relos" multiple times?

> +       }
> +       if (gen->relo_cnt) {
> +               free(gen->relos);
> +               gen->relo_cnt = 0;
> +               gen->relos = NULL;
> +       }
> +}
> +

[...]

> +       struct bpf_gen *gen_loader;
> +
>         /*
>          * Information when doing elf related work. Only valid if fd
>          * is valid.
> @@ -2651,7 +2654,15 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj)
>                 bpf_object__sanitize_btf(obj, kern_btf);
>         }
>
> -       err = btf__load(kern_btf);
> +       if (obj->gen_loader) {
> +               __u32 raw_size = 0;
> +               const void *raw_data = btf__get_raw_data(kern_btf, &raw_size);

this can return NULL on ENOMEM

> +
> +               bpf_gen__load_btf(obj->gen_loader, raw_data, raw_size);
> +               btf__set_fd(kern_btf, 0);

why setting fd to 0 (stdin)? does gen depend on this somewhere? The
problem is that it will eventually be closed on btf__free(), which
will close stdin, causing a big surprise. What will happen if you
leave it at -1?

> +       } else {
> +               err = btf__load(kern_btf);
> +       }
>         if (sanitize) {
>                 if (!err) {
>                         /* move fd to libbpf's BTF */
> @@ -4262,6 +4273,12 @@ static bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id f
>         struct kern_feature_desc *feat = &feature_probes[feat_id];
>         int ret;
>
> +       if (obj->gen_loader)
> +               /* To generate loader program assume the latest kernel
> +                * to avoid doing extra prog_load, map_create syscalls.
> +                */
> +               return true;
> +
>         if (READ_ONCE(feat->res) == FEAT_UNKNOWN) {
>                 ret = feat->probe();
>                 if (ret > 0) {
> @@ -4344,6 +4361,13 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
>         char *cp, errmsg[STRERR_BUFSIZE];
>         int err, zero = 0;
>
> +       if (obj->gen_loader) {
> +               bpf_gen__map_update_elem(obj->gen_loader, map - obj->maps,

it would be great for bpf_gen__map_update_elem to reflect that it's
not a generic map_update_elem() call, rather special internal map
update (just use bpf_gen__populate_internal_map?) Whether to freeze or
not could be just a flag to the same call, they always go together.

> +                                        map->mmaped, map->def.value_size);
> +               if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG)
> +                       bpf_gen__map_freeze(obj->gen_loader, map - obj->maps);
> +               return 0;
> +       }
>         err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0);
>         if (err) {
>                 err = -errno;
> @@ -4369,7 +4393,7 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
>
>  static void bpf_map__destroy(struct bpf_map *map);
>
> -static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map)
> +static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner)
>  {
>         struct bpf_create_map_attr create_attr;
>         struct bpf_map_def *def = &map->def;
> @@ -4415,9 +4439,9 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map)
>
>         if (bpf_map_type__is_map_in_map(def->type)) {
>                 if (map->inner_map) {
> -                       int err;
> +                       int err = 0;

no need to initialize to zero, you are assigning it right below

>
> -                       err = bpf_object__create_map(obj, map->inner_map);
> +                       err = bpf_object__create_map(obj, map->inner_map, true);
>                         if (err) {
>                                 pr_warn("map '%s': failed to create inner map: %d\n",
>                                         map->name, err);

[...]

> @@ -4469,7 +4498,12 @@ static int init_map_slots(struct bpf_map *map)
>
>                 targ_map = map->init_slots[i];
>                 fd = bpf_map__fd(targ_map);
> -               err = bpf_map_update_elem(map->fd, &i, &fd, 0);
> +               if (obj->gen_loader) {
> +                       printf("// TODO map_update_elem: idx %ld key %d value==map_idx %ld\n",
> +                              map - obj->maps, i, targ_map - obj->maps);

return error for now?

> +               } else {
> +                       err = bpf_map_update_elem(map->fd, &i, &fd, 0);
> +               }
>                 if (err) {
>                         err = -errno;
>                         pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n",

[...]

> @@ -6082,6 +6119,11 @@ static int bpf_core_apply_relo(struct bpf_program *prog,
>         if (str_is_empty(spec_str))
>                 return -EINVAL;
>
> +       if (prog->obj->gen_loader) {
> +               printf("// TODO core_relo: prog %ld insn[%d] %s %s kind %d\n",
> +                      prog - prog->obj->programs, relo->insn_off / 8,
> +                      local_name, spec_str, relo->kind);

same, return error? Drop printf, maybe leave pr_debug()?

> +       }
>         err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec);
>         if (err) {
>                 pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n",
> @@ -6821,6 +6863,19 @@ bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog)
>
>         return 0;
>  }

empty line here

> +static void
> +bpf_object__free_relocs(struct bpf_object *obj)
> +{
> +       struct bpf_program *prog;
> +       int i;
> +
> +       /* free up relocation descriptors */
> +       for (i = 0; i < obj->nr_programs; i++) {
> +               prog = &obj->programs[i];
> +               zfree(&prog->reloc_desc);
> +               prog->nr_reloc = 0;
> +       }
> +}
>

[...]

> +static int bpf_program__record_externs(struct bpf_program *prog)
> +{
> +       struct bpf_object *obj = prog->obj;
> +       int i;
> +
> +       for (i = 0; i < prog->nr_reloc; i++) {
> +               struct reloc_desc *relo = &prog->reloc_desc[i];
> +               struct extern_desc *ext = &obj->externs[relo->sym_off];
> +
> +               switch (relo->type) {
> +               case RELO_EXTERN_VAR:
> +                       if (ext->type != EXT_KSYM)
> +                               continue;
> +                       if (!ext->ksym.type_id) /* typeless ksym */
> +                               continue;

this shouldn't be silently ignored, if it's not supported, it should
return error

> +                       bpf_gen__record_extern(obj->gen_loader, ext->name, BTF_KIND_VAR,
> +                                              relo->insn_idx);
> +                       break;
> +               case RELO_EXTERN_FUNC:
> +                       bpf_gen__record_extern(obj->gen_loader, ext->name, BTF_KIND_FUNC,
> +                                              relo->insn_idx);
> +                       break;
> +               default:
> +                       continue;
> +               }
> +       }
> +       return 0;
> +}
> +

[...]

> @@ -7868,6 +7970,9 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
>         err = err ? : bpf_object__relocate(obj, attr->target_btf_path);
>         err = err ? : bpf_object__load_progs(obj, attr->log_level);
>
> +       if (obj->gen_loader && !err)
> +               err = bpf_gen__finish(obj->gen_loader);
> +
>         /* clean up module BTFs */
>         for (i = 0; i < obj->btf_module_cnt; i++) {
>                 close(obj->btf_modules[i].fd);
> @@ -8493,6 +8598,7 @@ void bpf_object__close(struct bpf_object *obj)
>         if (obj->clear_priv)
>                 obj->clear_priv(obj, obj->priv);

bpf_object__close() will close all those FD=0 in maps/progs, that's not good

>
> +       bpf_gen__free(obj->gen_loader);
>         bpf_object__elf_finish(obj);
>         bpf_object__unload(obj);
>         btf__free(obj->btf);

[...]

> @@ -9387,7 +9521,13 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, int *btf_obj_fd,
>         }
>
>         /* kernel/module BTF ID */
> -       err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id);
> +       if (prog->obj->gen_loader) {
> +               bpf_gen__record_attach_target(prog->obj->gen_loader, attach_name, attach_type);
> +               *btf_obj_fd = 0;

this will leak kernel module BTF FDs

> +               *btf_type_id = 1;
> +       } else {
> +               err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id);
> +       }
>         if (err) {
>                 pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err);
>                 return err;

[...]

> +out:
> +       close(map_fd);
> +       close(prog_fd);

this does close(-1), check >= 0

> +       return err;
> +}
> +
> +#endif
> --
> 2.30.2
>