On Tue, Nov 23, 2021 at 10:02 PM Alexei Starovoitov
<alexei.starovoitov@xxxxxxxxx> wrote:
>
> From: Alexei Starovoitov <ast@xxxxxxxxxx>
>
> Given BPF program's BTF root type name perform the following steps:
> . search in vmlinux candidate cache.
> . if (present in cache and candidate list >= 1) return candidate list.
> . do a linear search through kernel BTFs for possible candidates.
> . regardless of number of candidates found populate vmlinux cache.
> . if (candidate list >= 1) return candidate list.
> . search in module candidate cache.
> . if (present in cache) return candidate list (even if list is empty).
> . do a linear search through BTFs of all kernel modules
>   collecting candidates from all of them.
> . regardless of number of candidates found populate module cache.
> . return candidate list.
> Then wire the result into bpf_core_apply_relo_insn().
>
> When BPF program is trying to CO-RE relocate a type
> that doesn't exist in either vmlinux BTF or in modules BTFs
> these steps will perform 2 cache lookups when cache is hit.
>
> Note the cache doesn't prevent the abuse by the program that might
> have lots of relocations that cannot be resolved. Hence cond_resched().
>
> CO-RE in the kernel requires CAP_BPF, since BTF loading requires it.
>
> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
> ---
>  kernel/bpf/btf.c          | 250 +++++++++++++++++++++++++++++++++++++-
>  tools/lib/bpf/relo_core.h |   2 +
>  2 files changed, 251 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index dbf1f389b1d3..cf971b8a0769 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -25,6 +25,7 @@
>  #include <linux/kobject.h>
>  #include <linux/sysfs.h>
>  #include <net/sock.h>
> +#include "../tools/lib/bpf/relo_core.h"
>

[...]

> +static void populate_cand_cache(struct bpf_cand_cache *cands,
> +                               struct bpf_cand_cache **cache,
> +                               int cache_size)
> +{
> +       u32 hash = jhash_2words(cands->name_len,
> +                               (((u32) cands->name[0]) << 8) | cands->name[1], 0);

maybe add a helper func to calculate the hash given struct bpf_cand_cache
to keep the logic always in sync?
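Something like (untested, just reusing the expression above; the helper
name is made up):

static u32 bpf_cand_cache_hash(const struct bpf_cand_cache *cands)
{
        return jhash_2words(cands->name_len,
                            (((u32)cands->name[0]) << 8) | cands->name[1], 0);
}

so check_cand_cache() and populate_cand_cache() can't drift apart.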
> +       struct bpf_cand_cache *cc = cache[hash % cache_size];
> +
> +       if (cc)
> +               bpf_free_cands(cc);
> +       cache[hash % cache_size] = cands;
> +}
> +

[...]

> +static struct bpf_cand_cache *
> +bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
> +{
> +       const struct btf *local_btf = ctx->btf;
> +       const struct btf_type *local_type;
> +       const struct btf *main_btf;
> +       size_t local_essent_len;
> +       struct bpf_cand_cache *cands, *cc;
> +       struct btf *mod_btf;
> +       const char *name;
> +       int id;
> +
> +       local_type = btf_type_by_id(local_btf, local_type_id);
> +       if (!local_type)
> +               return ERR_PTR(-EINVAL);
> +
> +       name = btf_name_by_offset(local_btf, local_type->name_off);
> +       if (str_is_empty(name))
> +               return ERR_PTR(-EINVAL);
> +       local_essent_len = bpf_core_essential_name_len(name);
> +
> +       cands = kcalloc(1, sizeof(*cands), GFP_KERNEL);
> +       if (!cands)
> +               return ERR_PTR(-ENOMEM);
> +       cands->name = kmemdup_nul(name, local_essent_len, GFP_KERNEL);

it's pretty minor, but you don't really need kmemdup_nul() until
populate_cand_cache(); you can use name as-is until you actually need to
cache cands

> +       if (!cands->name) {
> +               kfree(cands);
> +               return ERR_PTR(-ENOMEM);
> +       }
> +       cands->kind = btf_kind(local_type);
> +       cands->name_len = local_essent_len;
> +
> +       cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
> +       if (cc) {
> +               if (cc->cnt) {
> +                       bpf_free_cands(cands);
> +                       return cc;
> +               }
> +               goto check_modules;
> +       }
> +
> +       /* Attempt to find target candidates in vmlinux BTF first */
> +       main_btf = bpf_get_btf_vmlinux();
> +       cands = bpf_core_add_cands(cands, main_btf, 1);
> +       if (IS_ERR(cands))
> +               return cands;
> +
> +       /* populate cache even when cands->cnt == 0 */
> +       populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
> +
> +       /* if vmlinux BTF has any candidate, don't go for module BTFs */
> +       if (cands->cnt)
> +               return cands;
> +
> +check_modules:
> +       cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
> +       if (cc) {
> +               bpf_free_cands(cands);
> +               /* if cache has it return it even if cc->cnt == 0 */
> +               return cc;
> +       }
> +
> +       /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
> +       spin_lock_bh(&btf_idr_lock);
> +       idr_for_each_entry(&btf_idr, mod_btf, id) {
> +               if (!btf_is_module(mod_btf))
> +                       continue;
> +               /* linear search could be slow hence unlock/lock
> +                * the IDR to avoiding holding it for too long
> +                */
> +               btf_get(mod_btf);
> +               spin_unlock_bh(&btf_idr_lock);
> +               cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
> +               if (IS_ERR(cands)) {
> +                       btf_put(mod_btf);
> +                       return cands;
> +               }
> +               spin_lock_bh(&btf_idr_lock);
> +               btf_put(mod_btf);

You either need to additionally btf_get(mod_btf) inside
bpf_core_add_cands(), or not btf_put() it here if you added at least one
candidate. You are storing targ_btf inside bpf_core_add_cands(), so
dropping the refcount here might leave a dangling pointer behind.
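For the first option, the rough idea (untested; bpf_core_add_cands() is
elided above, so the surrounding names are approximate):

        /* in bpf_core_add_cands(), where a candidate is recorded: */
        btf_get((struct btf *)targ_btf); /* pin the btf the stored pointer refers to */
        new_cands->cands[new_cands->cnt].btf = targ_btf;
        new_cands->cands[new_cands->cnt].id = targ_id;
        new_cands->cnt++;

with a matching btf_put() on every cands[i].btf in bpf_free_cands() (and
wherever else a candidate list is dropped).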
> +       }
> +       spin_unlock_bh(&btf_idr_lock);
> +       /* populate cache even when cands->cnt == 0 */
> +       populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
> +       return cands;
> +}
> +
>  int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
>                    int relo_idx, void *insn)
>  {
> -       return -EOPNOTSUPP;
> +       struct bpf_core_cand_list cands = {};
> +       int err;
> +
> +       if (relo->kind != BPF_CORE_TYPE_ID_LOCAL) {
> +               struct bpf_cand_cache *cc;
> +               int i;
> +
> +               mutex_lock(&cand_cache_mutex);
> +               cc = bpf_core_find_cands(ctx, relo->type_id);
> +               if (IS_ERR(cc)) {
> +                       bpf_log(ctx->log, "target candidate search failed for %d\n",
> +                               relo->type_id);
> +                       return PTR_ERR(cc);
> +               }
> +               if (cc->cnt) {
> +                       cands.cands = kcalloc(cc->cnt,
> +                                             sizeof(*cands.cands), GFP_KERNEL);
> +                       if (!cands.cands)
> +                               return -ENOMEM;
> +               }
> +               for (i = 0; i < cc->cnt; i++) {
> +                       bpf_log(ctx->log,
> +                               "CO-RE relocating %s %s: found target candidate [%d]\n",
> +                               btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
> +                       cands.cands[i].btf = cc->cands[i].btf;
> +                       cands.cands[i].id = cc->cands[i].id;
> +               }
> +               cands.len = cc->cnt;
> +               mutex_unlock(&cand_cache_mutex);
> +       }
> +

The cache is not locked at this point, so those cands.cands[i].btf
objects might already be freed (if a module got unloaded in the
meantime), right?

This global sharing of that small cache seems to cause unnecessary
headaches, tbh. It adds a global mutex (which might also block on the
kcalloc() done under it). If you used that cache locally, for processing
a single bpf_prog, you wouldn't need the locking. It could probably also
simplify the refcounting, especially if you just btf_get(targ_btf) for
each candidate and btf_put() it after all relos are processed. You'd
also be half a step away from removing the size restriction: just chain
bpf_cand_caches together and you get a hash with a fixed number of
buckets but unbounded chain length, which would probably be totally fine
for all practical purposes. See the sketch at the end of this mail.

> +       err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
> +                                      relo, relo_idx, ctx->btf, &cands);
> +       kfree(cands.cands);
> +       return err;
>  }
> diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h
> index f410691cc4e5..f7b0d698978c 100644
> --- a/tools/lib/bpf/relo_core.h
> +++ b/tools/lib/bpf/relo_core.h
> @@ -8,8 +8,10 @@
>
>  struct bpf_core_cand {
>         const struct btf *btf;
> +#ifndef __KERNEL__
>         const struct btf_type *t;
>         const char *name;
> +#endif

Why? These don't seem to be used on the kernel side, and both t and name
can be derived from btf and id anyway.

>         __u32 id;
>  };
>
> --
> 2.30.2
>
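To illustrate the per-program cache shape I mean above (untested sketch;
apply_core_relos(), CAND_CACHE_BUCKETS, free_cand_cache() and the relo
iteration are all made up for illustration, and bpf_core_apply() grows a
cache argument that this patch doesn't have):

/* Per-program candidate cache: it lives on the load path's stack, so no
 * global mutex is needed.  Each cached candidate would hold a btf ref
 * (taken via btf_get() in bpf_core_add_cands()), dropped in one place
 * once all relocations are processed.
 */
static int apply_core_relos(struct bpf_prog *prog, struct bpf_core_ctx *ctx)
{
        /* hypothetical bucket array; buckets chain, so no size limit */
        struct bpf_cand_cache *cache[CAND_CACHE_BUCKETS] = {};
        int i, err = 0;

        for (i = 0; i < prog->aux->core_relo_cnt; i++) {
                const struct bpf_core_relo *relo = &prog->aux->core_relos[i];

                err = bpf_core_apply(ctx, relo, i,
                                     prog->insnsi + relo->insn_off / 8, cache);
                if (err)
                        break;
        }
        /* btf_put() every cached candidate's btf and free the chains */
        free_cand_cache(cache, CAND_CACHE_BUCKETS);
        return err;
}

The nice property is that the cache's lifetime is bounded by a single
prog load, so there is nothing to invalidate when modules come and go.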