Re: [PATCH v4 bpf-next 07/16] bpf: Add bpf_core_add_cands() and wire it into bpf_core_apply_relo_insn().

On Mon, Nov 29, 2021 at 7:18 PM Alexei Starovoitov
<alexei.starovoitov@xxxxxxxxx> wrote:
>
> On Mon, Nov 29, 2021 at 05:03:37PM -0800, Andrii Nakryiko wrote:
> > On Tue, Nov 23, 2021 at 10:02 PM Alexei Starovoitov
> > <alexei.starovoitov@xxxxxxxxx> wrote:
> > >
> > > From: Alexei Starovoitov <ast@xxxxxxxxxx>
> > >
> > > Given a BPF program's BTF root type name, perform the following steps:
> > > . search in vmlinux candidate cache.
> > > . if (present in cache and candidate list >= 1) return candidate list.
> > > . do a linear search through kernel BTFs for possible candidates.
> > > . regardless of number of candidates found populate vmlinux cache.
> > > . if (candidate list >= 1) return candidate list.
> > > . search in module candidate cache.
> > > . if (present in cache) return candidate list (even if list is empty).
> > > . do a linear search through BTFs of all kernel modules
> > >   collecting candidates from all of them.
> > > . regardless of number of candidates found populate module cache.
> > > . return candidate list.
> > > Then wire the result into bpf_core_apply_relo_insn().
> > >
> > > When a BPF program is trying to CO-RE relocate a type
> > > that doesn't exist in either vmlinux BTF or in module BTFs
> > > these steps will perform 2 cache lookups when the cache is hit.
> > >
> > > Note the cache doesn't prevent abuse by a program that might
> > > have lots of relocations that cannot be resolved. Hence the cond_resched().
> > >
> > > CO-RE in the kernel requires CAP_BPF, since BTF loading requires it.
> > >
> > > Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
> > > ---
> > >  kernel/bpf/btf.c          | 250 +++++++++++++++++++++++++++++++++++++-
> > >  tools/lib/bpf/relo_core.h |   2 +
> > >  2 files changed, 251 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > > index dbf1f389b1d3..cf971b8a0769 100644
> > > --- a/kernel/bpf/btf.c
> > > +++ b/kernel/bpf/btf.c
> > > @@ -25,6 +25,7 @@
> > >  #include <linux/kobject.h>
> > >  #include <linux/sysfs.h>
> > >  #include <net/sock.h>
> > > +#include "../tools/lib/bpf/relo_core.h"
> > >
> >
> > [...]
> >
> > > +static void populate_cand_cache(struct bpf_cand_cache *cands,
> > > +                               struct bpf_cand_cache **cache,
> > > +                               int cache_size)
> > > +{
> > > +       u32 hash = jhash_2words(cands->name_len,
> > > +                               (((u32) cands->name[0]) << 8) | cands->name[1], 0);
> >
> > maybe add a helper func to calculate the hash given struct
> > bpf_cand_cache to keep the logic always in sync?
>
> I felt it's trivial enough, but sure I can do that.
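
something like this is what I had in mind (untested sketch, the helper
name is made up):

static u32 cand_cache_hash(const struct bpf_cand_cache *cands)
{
	/* needs <linux/jhash.h>; mixes the essential name length with
	 * its first two bytes, same as the open-coded version
	 */
	return jhash_2words(cands->name_len,
			    (((u32)cands->name[0]) << 8) | cands->name[1], 0);
}

so check_cand_cache() and populate_cand_cache() can't drift apart.
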
>
> > > +       struct bpf_cand_cache *cc = cache[hash % cache_size];
> > > +
> > > +       if (cc)
> > > +               bpf_free_cands(cc);
> > > +       cache[hash % cache_size] = cands;
> > > +}
> > > +
> >
> > [...]
> >
> > > +static struct bpf_cand_cache *
> > > +bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
> > > +{
> > > +       const struct btf *local_btf = ctx->btf;
> > > +       const struct btf_type *local_type;
> > > +       const struct btf *main_btf;
> > > +       size_t local_essent_len;
> > > +       struct bpf_cand_cache *cands, *cc;
> > > +       struct btf *mod_btf;
> > > +       const char *name;
> > > +       int id;
> > > +
> > > +       local_type = btf_type_by_id(local_btf, local_type_id);
> > > +       if (!local_type)
> > > +               return ERR_PTR(-EINVAL);
> > > +
> > > +       name = btf_name_by_offset(local_btf, local_type->name_off);
> > > +       if (str_is_empty(name))
> > > +               return ERR_PTR(-EINVAL);
> > > +       local_essent_len = bpf_core_essential_name_len(name);
> > > +
> > > +       cands = kcalloc(1, sizeof(*cands), GFP_KERNEL);
> > > +       if (!cands)
> > > +               return ERR_PTR(-ENOMEM);
> > > +       cands->name = kmemdup_nul(name, local_essent_len, GFP_KERNEL);
> >
> > it's pretty minor, but you don't really need kmemdup_nul() until
> > populate_cand_cache(), you can use name as is until you really need to
> > cache cands
>
> I thought about it, but didn't do it, because it complicates the code:
> bpf_core_add_cands() would somehow need to differentiate
> whether cands->name came as a 1st call into bpf_core_add_cands()
> or it's a subsequent call.

ah, I initially missed that bpf_core_add_cands() can also free the
candidate list; yeah, it's fine, as I said it's minor

> It could be a flag in struct bpf_cand_cache that will tell
> bpf_free_cands() whether the name needs to be freed or not, but that
> feels unnecessarily complex.

right, flags suck

>
> >
> > > +       if (!cands->name) {
> > > +               kfree(cands);
> > > +               return ERR_PTR(-ENOMEM);
> > > +       }
> > > +       cands->kind = btf_kind(local_type);
> > > +       cands->name_len = local_essent_len;
> > > +
> > > +       cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
> > > +       if (cc) {
> > > +               if (cc->cnt) {
> > > +                       bpf_free_cands(cands);
> > > +                       return cc;
> > > +               }
> > > +               goto check_modules;
> > > +       }
> > > +
> > > +       /* Attempt to find target candidates in vmlinux BTF first */
> > > +       main_btf = bpf_get_btf_vmlinux();
> > > +       cands = bpf_core_add_cands(cands, main_btf, 1);
> > > +       if (IS_ERR(cands))
> > > +               return cands;
> > > +
> > > +       /* populate cache even when cands->cnt == 0 */
> > > +       populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
> > > +
> > > +       /* if vmlinux BTF has any candidate, don't go for module BTFs */
> > > +       if (cands->cnt)
> > > +               return cands;
> > > +
> > > +check_modules:
> > > +       cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
> > > +       if (cc) {
> > > +               bpf_free_cands(cands);
> > > +               /* if cache has it return it even if cc->cnt == 0 */
> > > +               return cc;
> > > +       }
> > > +
> > > +       /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
> > > +       spin_lock_bh(&btf_idr_lock);
> > > +       idr_for_each_entry(&btf_idr, mod_btf, id) {
> > > +               if (!btf_is_module(mod_btf))
> > > +                       continue;
> > > +               /* linear search could be slow hence unlock/lock
> > > +                * the IDR to avoid holding it for too long
> > > +                */
> > > +               btf_get(mod_btf);
> > > +               spin_unlock_bh(&btf_idr_lock);
> > > +               cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
> > > +               if (IS_ERR(cands)) {
> > > +                       btf_put(mod_btf);
> > > +                       return cands;
> > > +               }
> > > +               spin_lock_bh(&btf_idr_lock);
> > > +               btf_put(mod_btf);
> >
> > either need to additionally btf_get(mod_btf) inside
> > bpf_core_add_cands() not btf_put() it here if you added at least one
> > candidate, as you are storing targ_btf inside bpf_core_add_cands() and
> > dropping refcount might leave dangling pointer
>
> Module will not go away while cands are being searched and cache ops are done.
> purge_cand_cache() is called from MODULE_STATE_GOING.
> I've considered doing the purge from btf_put(),
> but we don't guarantee the context in there, so mutex_lock
> would complicate btf_put too much.
> It's simpler to do purge from MODULE_STATE_GOING.
> But more below...
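
ok, that makes sense. So something like this in the module notifier, I
presume (untested sketch based on your description; the btf_module_notify()
placement and the btf_mod->btf handle are my guesses):

	case MODULE_STATE_GOING:
		/* module BTF is about to be freed: drop any cached
		 * candidate lists that may reference it
		 */
		purge_cand_cache(btf_mod->btf);
		break;
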
>
> > > +               for (i = 0; i < cc->cnt; i++) {
> > > +                       bpf_log(ctx->log,
> > > +                               "CO-RE relocating %s %s: found target candidate [%d]\n",
> > > +                               btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
> > > +                       cands.cands[i].btf = cc->cands[i].btf;
> > > +                       cands.cands[i].id = cc->cands[i].id;
> > > +               }
> > > +               cands.len = cc->cnt;
> > > +               mutex_unlock(&cand_cache_mutex);
> > > +       }
> > > +
> >
> > cache is not locked at this point, so those cands.cands[i].btf objects
> > might be freed now (if module got unloaded meanwhile), right?
>
> right, looks easier to do btf_get here while copying the pointer
> and do a loop of btf_put after bpf_core_apply_relo_insn.
> Doing btf_get inside bpf_core_add_cands adds complexity to cache replacement
> and purging.
> Right now __purge_cand_cache is just kfree of the whole slot
> with multiple btf pointers in there potentially from different modules.
> With btf_get in add_cands it would need to be a loop and
> bpf_free_cands would need to have a loop.
> btw the earlier versions of this patch set had the same issue,
> so thanks for the good catch!

no problem. Yeah, btf_get() here should work as well, I think.
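
i.e. roughly this (untested sketch of the btf_get()/btf_put() variant):

	for (i = 0; i < cc->cnt; i++) {
		bpf_log(ctx->log,
			"CO-RE relocating %s %s: found target candidate [%d]\n",
			btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
		/* hold a ref so module BTF can't be freed while the
		 * copied pointer is used outside cand_cache_mutex
		 */
		btf_get((struct btf *)cc->cands[i].btf);
		cands.cands[i].btf = cc->cands[i].btf;
		cands.cands[i].id = cc->cands[i].id;
	}
	cands.len = cc->cnt;
	mutex_unlock(&cand_cache_mutex);

	err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
				       relo, relo_idx, ctx->btf, &cands);
	/* drop the refs taken above now that the relocation is done */
	for (i = 0; i < cands.len; i++)
		btf_put((struct btf *)cands.cands[i].btf);
	kfree(cands.cands);
	return err;
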
>
> > This global sharing of that small cache seems to cause unnecessary
> > headaches, tbh. It adds a global mutex (which might also be held
> > across blocking kcalloc calls). If you used that cache locally for
> > processing a single bpf_prog, you wouldn't need the locking. It can
> > probably also simplify the refcounting, especially if you just
> > btf_get(targ_btf) for each candidate and then btf_put() it after all
> > relos are processed. You are also a half-step away from removing the
> > size restriction (just chain bpf_cand_caches together) and having a
> > hash with a fixed number of buckets but non-fixed chain length (which
> > would probably be totally fine for all practical purposes).
>
> and that would be an almost complete hashtable? why add that complexity?

well, my point was that you already have all this complexity :) I see
avoiding the global mutex as less complexity; the hashmap parts are just
annoying and mundane code, but not really complexity.
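
by chaining I meant roughly this (sketch; I'm guessing the struct layout
from the rest of the patch, the ->next field is the only addition):

	struct bpf_cand_cache {
		const char *name;
		u32 name_len;
		u16 kind;
		u16 cnt;
		struct bpf_cand_cache *next;	/* hash-chain link */
		struct {
			const struct btf *btf;
			u32 id;
		} cands[];
	};

check_cand_cache() would walk cache[hash % cache_size] following ->next,
and populate_cand_cache() would prepend instead of replacing the slot.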

But if you insist on a shared global mini-cache, that's fine.

> The size restriction is necessary anyway for a global cache.
> Even for a per-program hashtable some size restriction might be
> necessary. CAP_BPF is a target for "researchers" to do
> weird things with the kernel.
>
> >
> >
> > > +       err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
> > > +                                      relo, relo_idx, ctx->btf, &cands);
> > > +       kfree(cands.cands);
> > > +       return err;
> > >  }
> > > diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h
> > > index f410691cc4e5..f7b0d698978c 100644
> > > --- a/tools/lib/bpf/relo_core.h
> > > +++ b/tools/lib/bpf/relo_core.h
> > > @@ -8,8 +8,10 @@
> > >
> > >  struct bpf_core_cand {
> > >         const struct btf *btf;
> > > +#ifndef __KERNEL__
> > >         const struct btf_type *t;
> > >         const char *name;
> > > +#endif
> >
> > why? doesn't seem to be used and both t and name could be derived from
> > btf and id
>
> exactly, that's why CO-RE in the kernel doesn't use them,
> but libbpf uses both for debugging and for passing information
> back and forth between layers.

oh, I thought you added those fields initially and forgot to delete
them or something; I didn't notice that you are just "opting them out"
for __KERNEL__. I think libbpf code doesn't strictly need them. Here's a
diff that completely removes their use; it's pretty straightforward and
minimal, so maybe instead of #ifdef'ing let's just do that?

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b59fede08ba7..95fa57eea289 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -5179,15 +5179,18 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand,
                   struct bpf_core_cand_list *cands)
 {
     struct bpf_core_cand *new_cands, *cand;
-    const struct btf_type *t;
-    const char *targ_name;
+    const struct btf_type *t, *local_t;
+    const char *targ_name, *local_name;
     size_t targ_essent_len;
     int n, i;

+    local_t = btf__type_by_id(local_cand->btf, local_cand->id);
+    local_name = btf__str_by_offset(local_cand->btf, local_t->name_off);
+
     n = btf__type_cnt(targ_btf);
     for (i = targ_start_id; i < n; i++) {
         t = btf__type_by_id(targ_btf, i);
-        if (btf_kind(t) != btf_kind(local_cand->t))
+        if (btf_kind(t) != btf_kind(local_t))
             continue;

         targ_name = btf__name_by_offset(targ_btf, t->name_off);
@@ -5198,12 +5201,12 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand,
         if (targ_essent_len != local_essent_len)
             continue;

-        if (strncmp(local_cand->name, targ_name, local_essent_len) != 0)
+        if (strncmp(local_name, targ_name, local_essent_len) != 0)
             continue;

         pr_debug("CO-RE relocating [%d] %s %s: found target candidate
[%d] %s %s in [%s]\n",
-             local_cand->id, btf_kind_str(local_cand->t),
-             local_cand->name, i, btf_kind_str(t), targ_name,
+             local_cand->id, btf_kind_str(local_t),
+             local_name, i, btf_kind_str(t), targ_name,
              targ_btf_name);
         new_cands = libbpf_reallocarray(cands->cands, cands->len + 1,
                           sizeof(*cands->cands));
@@ -5212,8 +5215,6 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand,

         cand = &new_cands[cands->len];
         cand->btf = targ_btf;
-        cand->t = t;
-        cand->name = targ_name;
         cand->id = i;

         cands->cands = new_cands;
@@ -5320,18 +5321,20 @@ bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 local_type_id)
     struct bpf_core_cand local_cand = {};
     struct bpf_core_cand_list *cands;
     const struct btf *main_btf;
+    const struct btf_type *local_t;
+    const char *local_name;
     size_t local_essent_len;
     int err, i;

     local_cand.btf = local_btf;
-    local_cand.t = btf__type_by_id(local_btf, local_type_id);
-    if (!local_cand.t)
+    local_t = btf__type_by_id(local_btf, local_type_id);
+    if (!local_t)
         return ERR_PTR(-EINVAL);

-    local_cand.name = btf__name_by_offset(local_btf, local_cand.t->name_off);
-    if (str_is_empty(local_cand.name))
+    local_name = btf__name_by_offset(local_btf, local_t->name_off);
+    if (str_is_empty(local_name))
         return ERR_PTR(-EINVAL);
-    local_essent_len = bpf_core_essential_name_len(local_cand.name);
+    local_essent_len = bpf_core_essential_name_len(local_name);

     cands = calloc(1, sizeof(*cands));
     if (!cands)
diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h
index 3b9f8f18346c..0dc0b9256bea 100644
--- a/tools/lib/bpf/relo_core.h
+++ b/tools/lib/bpf/relo_core.h
@@ -77,8 +77,6 @@ struct bpf_core_relo {

 struct bpf_core_cand {
     const struct btf *btf;
-    const struct btf_type *t;
-    const char *name;
     __u32 id;
 };


