On Mon, Nov 29, 2021 at 7:18 PM Alexei Starovoitov
<alexei.starovoitov@xxxxxxxxx> wrote:
>
> On Mon, Nov 29, 2021 at 05:03:37PM -0800, Andrii Nakryiko wrote:
> > On Tue, Nov 23, 2021 at 10:02 PM Alexei Starovoitov
> > <alexei.starovoitov@xxxxxxxxx> wrote:
> > >
> > > From: Alexei Starovoitov <ast@xxxxxxxxxx>
> > >
> > > Given BPF program's BTF root type name perform the following steps:
> > > . search in vmlinux candidate cache.
> > > . if (present in cache and candidate list >= 1) return candidate list.
> > > . do a linear search through kernel BTFs for possible candidates.
> > > . regardless of number of candidates found populate vmlinux cache.
> > > . if (candidate list >= 1) return candidate list.
> > > . search in module candidate cache.
> > > . if (present in cache) return candidate list (even if list is empty).
> > > . do a linear search through BTFs of all kernel modules
> > >   collecting candidates from all of them.
> > > . regardless of number of candidates found populate module cache.
> > > . return candidate list.
> > > Then wire the result into bpf_core_apply_relo_insn().
> > >
> > > When BPF program is trying to CO-RE relocate a type
> > > that doesn't exist in either vmlinux BTF or in modules BTFs
> > > these steps will perform 2 cache lookups when cache is hit.
> > >
> > > Note the cache doesn't prevent the abuse by the program that might
> > > have lots of relocations that cannot be resolved. Hence cond_resched().
> > >
> > > CO-RE in the kernel requires CAP_BPF, since BTF loading requires it.
> > >
> > > Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
> > > ---
> > >  kernel/bpf/btf.c          | 250 +++++++++++++++++++++++++++++++++++++-
> > >  tools/lib/bpf/relo_core.h |   2 +
> > >  2 files changed, 251 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > > index dbf1f389b1d3..cf971b8a0769 100644
> > > --- a/kernel/bpf/btf.c
> > > +++ b/kernel/bpf/btf.c
> > > @@ -25,6 +25,7 @@
> > >  #include <linux/kobject.h>
> > >  #include <linux/sysfs.h>
> > >  #include <net/sock.h>
> > > +#include "../tools/lib/bpf/relo_core.h"
> > >
> >
> > [...]
> >
> > > +static void populate_cand_cache(struct bpf_cand_cache *cands,
> > > +                               struct bpf_cand_cache **cache,
> > > +                               int cache_size)
> > > +{
> > > +       u32 hash = jhash_2words(cands->name_len,
> > > +                               (((u32) cands->name[0]) << 8) | cands->name[1], 0);
> >
> > maybe add a helper func to calculate the hash given struct
> > bpf_cand_cache to keep the logic always in sync?
>
> I felt it's trivial enough, but sure I can do that.
>
> > > +       struct bpf_cand_cache *cc = cache[hash % cache_size];
> > > +
> > > +       if (cc)
> > > +               bpf_free_cands(cc);
> > > +       cache[hash % cache_size] = cands;
> > > +}
> > > +
> >
> > [...]
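For reference, such a helper could be as small as the sketch below (untested;
the name hash_cands() is made up here, the fields follow the quoted patch),
with populate_cand_cache() and check_cand_cache() both calling it instead of
open-coding jhash_2words():

static u32 hash_cands(struct bpf_cand_cache *cands)
{
        /* same two-word hash over name_len and the first two name bytes
         * as in the quoted code, kept in one place so the two cache
         * lookup/populate paths cannot drift apart
         */
        return jhash_2words(cands->name_len,
                            (((u32)cands->name[0]) << 8) | cands->name[1], 0);
}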
> >
> > > +static struct bpf_cand_cache *
> > > +bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
> > > +{
> > > +       const struct btf *local_btf = ctx->btf;
> > > +       const struct btf_type *local_type;
> > > +       const struct btf *main_btf;
> > > +       size_t local_essent_len;
> > > +       struct bpf_cand_cache *cands, *cc;
> > > +       struct btf *mod_btf;
> > > +       const char *name;
> > > +       int id;
> > > +
> > > +       local_type = btf_type_by_id(local_btf, local_type_id);
> > > +       if (!local_type)
> > > +               return ERR_PTR(-EINVAL);
> > > +
> > > +       name = btf_name_by_offset(local_btf, local_type->name_off);
> > > +       if (str_is_empty(name))
> > > +               return ERR_PTR(-EINVAL);
> > > +       local_essent_len = bpf_core_essential_name_len(name);
> > > +
> > > +       cands = kcalloc(1, sizeof(*cands), GFP_KERNEL);
> > > +       if (!cands)
> > > +               return ERR_PTR(-ENOMEM);
> > > +       cands->name = kmemdup_nul(name, local_essent_len, GFP_KERNEL);
> >
> > it's pretty minor, but you don't really need kmemdup_nul() until
> > populate_cand_cache(), you can use name as is until you really need to
> > cache cands
>
> I thought about it, but didn't do it, because it complicates the code:
> bpf_core_add_cands() would somehow need to differentiate
> whether cands->name came as a 1st call into bpf_core_add_cands()
> or it's a subsequent call.

ah, I initially missed that bpf_core_add_cands() can also free the
candidate list; yeah, it's fine, as I said, minor

> It could be a flag in struct bpf_cand_cache that will tell
> whether bpf_free_cands() needs to be freed or not, but feels
> unnecessary complex.

right, flags suck

> > >
> > > +       if (!cands->name) {
> > > +               kfree(cands);
> > > +               return ERR_PTR(-ENOMEM);
> > > +       }
> > > +       cands->kind = btf_kind(local_type);
> > > +       cands->name_len = local_essent_len;
> > > +
> > > +       cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
> > > +       if (cc) {
> > > +               if (cc->cnt) {
> > > +                       bpf_free_cands(cands);
> > > +                       return cc;
> > > +               }
> > > +               goto check_modules;
> > > +       }
> > > +
> > > +       /* Attempt to find target candidates in vmlinux BTF first */
> > > +       main_btf = bpf_get_btf_vmlinux();
> > > +       cands = bpf_core_add_cands(cands, main_btf, 1);
> > > +       if (IS_ERR(cands))
> > > +               return cands;
> > > +
> > > +       /* populate cache even when cands->cnt == 0 */
> > > +       populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
> > > +
> > > +       /* if vmlinux BTF has any candidate, don't go for module BTFs */
> > > +       if (cands->cnt)
> > > +               return cands;
> > > +
> > > +check_modules:
> > > +       cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
> > > +       if (cc) {
> > > +               bpf_free_cands(cands);
> > > +               /* if cache has it return it even if cc->cnt == 0 */
> > > +               return cc;
> > > +       }
> > > +
> > > +       /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
> > > +       spin_lock_bh(&btf_idr_lock);
> > > +       idr_for_each_entry(&btf_idr, mod_btf, id) {
> > > +               if (!btf_is_module(mod_btf))
> > > +                       continue;
> > > +               /* linear search could be slow hence unlock/lock
> > > +                * the IDR to avoiding holding it for too long
> > > +                */
> > > +               btf_get(mod_btf);
> > > +               spin_unlock_bh(&btf_idr_lock);
> > > +               cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
> > > +               if (IS_ERR(cands)) {
> > > +                       btf_put(mod_btf);
> > > +                       return cands;
> > > +               }
> > > +               spin_lock_bh(&btf_idr_lock);
> > > +               btf_put(mod_btf);
> >
> > either need to additionally btf_get(mod_btf) inside
> > bpf_core_add_cands() not btf_put() it here if you added at least one
> > candidate, as you
> > are storing targ_btf inside bpf_core_add_cands() and
> > dropping refcount might leave dangling pointer
>
> Module will not go away while cands are being searched and cache ops are done.
> purge_cand_cache() is called from MODULE_STATE_GOING.
> I've considered doing the purge from btf_put(),
> but we don't guarantee the context in there, so mutex_lock
> would complicate btf_put too much.
> It's simpler to do purge from MODULE_STATE_GOING.
> But more below...
>
> > > +       for (i = 0; i < cc->cnt; i++) {
> > > +               bpf_log(ctx->log,
> > > +                       "CO-RE relocating %s %s: found target candidate [%d]\n",
> > > +                       btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
> > > +               cands.cands[i].btf = cc->cands[i].btf;
> > > +               cands.cands[i].id = cc->cands[i].id;
> > > +       }
> > > +       cands.len = cc->cnt;
> > > +       mutex_unlock(&cand_cache_mutex);
> > > +       }
> > > +
> >
> > cache is not locked at this point, so those cands.cands[i].btf objects
> > might be freed now (if module got unloaded meanwhile), right?
>
> right, looks easier to do btf_get here while copying the pointer.
> and do a loop of btf_put after bpf_core_apply_relo_insn.
> Doing btf_get inside bpf_core_add_cands adds complexity to cache replacement
> and purging.
> Right now __purge_cand_cache is just kfree of the whole slot
> with multiple btf pointers in there potentially from different modules.
> With btf_get in add_cands it would need to be a loop and
> bpf_free_cands would need to have a loop.
> btw the earlier versions of this patch set had the same issue,
> so thanks for the good catch!

no problem. Yeah, btf_get() here should work as well, I think.

>
> > This global sharing of that small cache seems to cause unnecessary
> > headaches, tbh. It adds global mutex (which might also block for
> > kcalloc). If you used that cache locally for processing single
> > bpf_prog, you wouldn't need the locking. It can probably also simplify
> > the refcounting, especially if you just btf_get(targ_btf) for each
> > candidate and then btf_put() it after all relos are processed. You are
> > also half-step away from removing the size restriction (just chain
> > bpf_cand_caches together) and having a fixed bucket-size hash with
> > non-fixed chain length (which probably would be totally fine for all
> > practical purposes).
>
> and that would be a almost done hashtable? why add that complexity?

well, my point was that you already have all this complexity :) I see
avoiding global mutex as less complexity, hashmap parts are just
annoying and mundane code, but not really a complexity. But if you
insist on a shared global mini-cache, that's fine.

> The size restriction is necessary anyway for a global cache.
> Even for per-program hashtable some size restriction might be
> necessary. CAP_BPF is a target for "researchers" to do
> weird things with the kernel.
> >
> > >
> > > +       err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
> > > +                                      relo, relo_idx, ctx->btf, &cands);
> > > +       kfree(cands.cands);
> > > +       return err;
> > >  }
> > > diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h
> > > index f410691cc4e5..f7b0d698978c 100644
> > > --- a/tools/lib/bpf/relo_core.h
> > > +++ b/tools/lib/bpf/relo_core.h
> > > @@ -8,8 +8,10 @@
> > >
> > >  struct bpf_core_cand {
> > >         const struct btf *btf;
> > > +#ifndef __KERNEL__
> > >         const struct btf_type *t;
> > >         const char *name;
> > > +#endif
> >
> > why?
> > doesn't seem to be used and both t and name could be derived from
> > btf and id
>
> exactly, that's why CO-RE in the kernel doesn't use them,
> but libbpf uses both for debugging and for passing information
> back and forth between layers.

oh, I thought you added those fields initially and forgot to delete or
something, didn't notice that you are just "opting them out" for
__KERNEL__. I think libbpf code doesn't strictly need this, here's the
diff that completely removes their use, it's pretty straightforward and
minimal, so maybe instead of #ifdef'ing let's just do that?

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b59fede08ba7..95fa57eea289 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -5179,15 +5179,18 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand,
                               struct bpf_core_cand_list *cands)
 {
        struct bpf_core_cand *new_cands, *cand;
-       const struct btf_type *t;
-       const char *targ_name;
+       const struct btf_type *t, *local_t;
+       const char *targ_name, *local_name;
        size_t targ_essent_len;
        int n, i;

+       local_t = btf__type_by_id(local_cand->btf, local_cand->id);
+       local_name = btf__str_by_offset(local_cand->btf, local_t->name_off);
+
        n = btf__type_cnt(targ_btf);
        for (i = targ_start_id; i < n; i++) {
                t = btf__type_by_id(targ_btf, i);
-               if (btf_kind(t) != btf_kind(local_cand->t))
+               if (btf_kind(t) != btf_kind(local_t))
                        continue;

                targ_name = btf__name_by_offset(targ_btf, t->name_off);
@@ -5198,12 +5201,12 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand,
                if (targ_essent_len != local_essent_len)
                        continue;

-               if (strncmp(local_cand->name, targ_name, local_essent_len) != 0)
+               if (strncmp(local_name, targ_name, local_essent_len) != 0)
                        continue;

                pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s in [%s]\n",
-                        local_cand->id, btf_kind_str(local_cand->t),
-                        local_cand->name, i, btf_kind_str(t), targ_name,
+                        local_cand->id, btf_kind_str(local_t),
+                        local_name, i, btf_kind_str(t), targ_name,
                         targ_btf_name);

                new_cands = libbpf_reallocarray(cands->cands, cands->len + 1,
                                                sizeof(*cands->cands));
@@ -5212,8 +5215,6 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand,

                cand = &new_cands[cands->len];
                cand->btf = targ_btf;
-               cand->t = t;
-               cand->name = targ_name;
                cand->id = i;

                cands->cands = new_cands;
@@ -5320,18 +5321,20 @@ bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 l
        struct bpf_core_cand local_cand = {};
        struct bpf_core_cand_list *cands;
        const struct btf *main_btf;
+       const struct btf_type *local_t;
+       const char *local_name;
        size_t local_essent_len;
        int err, i;

        local_cand.btf = local_btf;
-       local_cand.t = btf__type_by_id(local_btf, local_type_id);
-       if (!local_cand.t)
+       local_t = btf__type_by_id(local_btf, local_type_id);
+       if (!local_t)
                return ERR_PTR(-EINVAL);
-       local_cand.name = btf__name_by_offset(local_btf, local_cand.t->name_off);
-       if (str_is_empty(local_cand.name))
+       local_name = btf__name_by_offset(local_btf, local_t->name_off);
+       if (str_is_empty(local_name))
                return ERR_PTR(-EINVAL);
-       local_essent_len = bpf_core_essential_name_len(local_cand.name);
+       local_essent_len = bpf_core_essential_name_len(local_name);

        cands = calloc(1, sizeof(*cands));
        if (!cands)
diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h
index 3b9f8f18346c..0dc0b9256bea 100644
--- a/tools/lib/bpf/relo_core.h
+++ b/tools/lib/bpf/relo_core.h
@@ -77,8 +77,6 @@ struct bpf_core_relo {

 struct bpf_core_cand {
        const struct btf *btf;
-       const struct btf_type *t;
-       const char *name;
        __u32 id;
 };
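(Just to spell out the "derived from btf and id" point: once only btf and id
are kept in struct bpf_core_cand, the removed fields can be recomputed
wherever they are needed, roughly like this, where cand is a hypothetical
struct bpf_core_cand pointer:

        /* re-derive type and name from the stored btf + id */
        const struct btf_type *t = btf__type_by_id(cand->btf, cand->id);
        const char *name = btf__name_by_offset(cand->btf, t->name_off);

which is essentially what the diff above does via the local_t/local_name
locals.)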