From: Alexei Starovoitov <ast@xxxxxxxxxx>

Given a BPF program's BTF root type name, perform the following steps
(a sketch of this flow in code follows the list):

. search the vmlinux candidate cache.
. if (present in cache and candidate list length >= 1) return candidate list.
. do a linear search through kernel BTFs for possible candidates.
. regardless of the number of candidates found, populate the vmlinux cache.
. if (candidate list length >= 1) return candidate list.
. search the module candidate cache.
. if (present in cache) return candidate list (even if the list is empty).
. do a linear search through the BTFs of all kernel modules, collecting
  candidates from all of them.
. regardless of the number of candidates found, populate the module cache.
. return candidate list.
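The same flow as a C sketch. Every name in it (find_cands, cache_lookup,
cache_store, scan_btf, scan_all_module_btfs, struct cand_list) is a
hypothetical stand-in for the helpers this patch adds, not an existing
kernel API; the scan_* helpers are assumed to always return a (possibly
empty) list:

	/* illustrative sketch of the candidate lookup order only */
	struct cand_list *find_cands(const char *name)
	{
		struct cand_list *cl;

		/* steps 1-2: vmlinux cache; a non-empty hit ends the search */
		cl = cache_lookup(vmlinux_cache, name);
		if (cl && cl->cnt)
			return cl;
		if (!cl) {
			/* steps 3-5: linear scan of vmlinux BTF; cache the
			 * result even when no candidate was found
			 */
			cl = scan_btf(vmlinux_btf, name);
			cache_store(vmlinux_cache, name, cl);
			if (cl->cnt)
				return cl;
		}
		/* steps 6-7: module cache; a hit is final even when empty */
		cl = cache_lookup(module_cache, name);
		if (cl)
			return cl;
		/* steps 8-10: scan every module BTF, then cache the result */
		cl = scan_all_module_btfs(name);
		cache_store(module_cache, name, cl);
		return cl;
	}

For a type that exists in neither vmlinux BTF nor any module BTF, both
cache levels end up holding an empty list, so repeated lookups cost
exactly the two cache_lookup() calls mentioned below.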
Then wire the result into bpf_core_apply_relo_insn().

When a BPF program tries to CO-RE relocate a type that doesn't exist in
either vmlinux BTF or module BTFs, these steps perform 2 cache lookups
once the caches are warm.

Note that the cache doesn't prevent abuse by a program carrying lots of
relocations that cannot be resolved; hence the cond_resched() in the
linear search.

CO-RE in the kernel requires CAP_BPF, since BTF loading requires it.

Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
---
 kernel/bpf/btf.c          | 250 +++++++++++++++++++++++++++++++++++++-
 tools/lib/bpf/relo_core.h |   2 +
 2 files changed, 251 insertions(+), 1 deletion(-)
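(Reviewer note, not part of the commit: both caches below are
direct-mapped with 31 buckets each and evict on collision, keyed by a
hash of the essential name's length and its first two bytes. The
standalone model below illustrates that policy; cand_hash() is an
arbitrary stand-in for the patch's jhash_2words() and nothing here is
kernel code.)

	#include <stdint.h>
	#include <string.h>

	#define CACHE_SIZE 31

	struct entry {
		char name[64];
		int valid;
	};
	static struct entry cache[CACHE_SIZE];

	/* stand-in for jhash_2words(name_len, first two bytes, 0); as in
	 * the patch, name is assumed non-empty, so name[1] is at worst
	 * the terminating NUL
	 */
	static uint32_t cand_hash(const char *name, uint32_t len)
	{
		return (len * 31) ^ (((uint32_t)name[0] << 8) | (uint8_t)name[1]);
	}

	static struct entry *cache_lookup(const char *name)
	{
		struct entry *e = &cache[cand_hash(name, strlen(name)) % CACHE_SIZE];

		/* a bucket hit still verifies the full name: two names can
		 * share a bucket, and only one occupies it at a time
		 */
		if (e->valid && !strcmp(e->name, name))
			return e;
		return NULL;
	}

	static void cache_populate(const char *name)
	{
		struct entry *e = &cache[cand_hash(name, strlen(name)) % CACHE_SIZE];

		/* evict-on-collision: the previous occupant is replaced */
		strncpy(e->name, name, sizeof(e->name) - 1);
		e->name[sizeof(e->name) - 1] = '\0';
		e->valid = 1;
	}

Collisions therefore cost a re-scan, not correctness: a colliding name
fails the full-name check, misses, and repopulates the bucket.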
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index dbf1f389b1d3..cf971b8a0769 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -25,6 +25,7 @@
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
 #include <net/sock.h>
+#include "../tools/lib/bpf/relo_core.h"
 
 /* BTF (BPF Type Format) is the meta data format which describes
  * the data types of BPF program/map.  Hence, it basically focus
@@ -6169,6 +6170,8 @@ btf_module_read(struct file *file, struct kobject *kobj,
 	return len;
 }
 
+static void purge_cand_cache(struct btf *btf);
+
 static int btf_module_notify(struct notifier_block *nb, unsigned long op,
 			     void *module)
 {
@@ -6245,6 +6248,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
 			list_del(&btf_mod->list);
 			if (btf_mod->sysfs_attr)
 				sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
+			purge_cand_cache(btf_mod->btf);
 			btf_put(btf_mod->btf);
 			kfree(btf_mod->sysfs_attr);
 			kfree(btf_mod);
@@ -6440,8 +6444,252 @@ size_t bpf_core_essential_name_len(const char *name)
 	return n;
 }
 
+struct bpf_cand_cache {
+	const char *name;
+	u32 name_len;
+	u16 kind;
+	u16 cnt;
+	struct {
+		const struct btf *btf;
+		u32 id;
+	} cands[];
+};
+
+static void bpf_free_cands(struct bpf_cand_cache *cands)
+{
+	kfree(cands->name);
+	kfree(cands);
+}
+
+#define VMLINUX_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
+
+#define MODULE_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
+					       struct bpf_cand_cache **cache,
+					       int cache_size)
+{
+	u32 hash = jhash_2words(cands->name_len,
+				(((u32) cands->name[0]) << 8) | cands->name[1], 0);
+	struct bpf_cand_cache *cc = cache[hash % cache_size];
+
+	if (cc && cc->name_len == cands->name_len &&
+	    !strncmp(cc->name, cands->name, cands->name_len))
+		return cc;
+	return NULL;
+}
+
+static void populate_cand_cache(struct bpf_cand_cache *cands,
+				struct bpf_cand_cache **cache,
+				int cache_size)
+{
+	u32 hash = jhash_2words(cands->name_len,
+				(((u32) cands->name[0]) << 8) | cands->name[1], 0);
+	struct bpf_cand_cache *cc = cache[hash % cache_size];
+
+	if (cc)
+		bpf_free_cands(cc);
+	cache[hash % cache_size] = cands;
+}
+
+static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
+			       int cache_size)
+{
+	struct bpf_cand_cache *cc;
+	int i, j;
+
+	for (i = 0; i < cache_size; i++) {
+		cc = cache[i];
+		if (!cc)
+			continue;
+		for (j = 0; j < cc->cnt; j++)
+			if (cc->cands[j].btf == btf) {
+				bpf_free_cands(cc);
+				cache[i] = NULL;
+				break;
+			}
+	}
+}
+
+static void purge_cand_cache(struct btf *btf)
+{
+	mutex_lock(&cand_cache_mutex);
+	__purge_cand_cache(btf, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+	__purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+	mutex_unlock(&cand_cache_mutex);
+}
+
+static struct bpf_cand_cache *
+bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
+		   int targ_start_id)
+{
+	struct bpf_cand_cache *new_cands;
+	const struct btf_type *t;
+	const char *targ_name;
+	size_t targ_essent_len;
+	int n, i;
+
+	n = btf_nr_types(targ_btf);
+	for (i = targ_start_id; i < n; i++) {
+		t = btf_type_by_id(targ_btf, i);
+		if (btf_kind(t) != cands->kind)
+			continue;
+
+		targ_name = btf_name_by_offset(targ_btf, t->name_off);
+		if (!targ_name)
+			continue;
+
+		/* the linear search can be long; stay preemption friendly */
+		cond_resched();
+
+		if (strncmp(cands->name, targ_name, cands->name_len) != 0)
+			continue;
+
+		targ_essent_len = bpf_core_essential_name_len(targ_name);
+		if (targ_essent_len != cands->name_len)
+			continue;
+
+		new_cands = krealloc(cands,
+				     offsetof(struct bpf_cand_cache, cands[cands->cnt + 1]),
+				     GFP_KERNEL);
+		if (!new_cands) {
+			bpf_free_cands(cands);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		cands = new_cands;
+		cands->cands[cands->cnt].btf = targ_btf;
+		cands->cands[cands->cnt].id = i;
+		cands->cnt++;
+	}
+	return cands;
+}
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
+{
+	const struct btf *local_btf = ctx->btf;
+	const struct btf_type *local_type;
+	const struct btf *main_btf;
+	size_t local_essent_len;
+	struct bpf_cand_cache *cands, *cc;
+	struct btf *mod_btf;
+	const char *name;
+	int id;
+
+	local_type = btf_type_by_id(local_btf, local_type_id);
+	if (!local_type)
+		return ERR_PTR(-EINVAL);
+
+	name = btf_name_by_offset(local_btf, local_type->name_off);
+	if (str_is_empty(name))
+		return ERR_PTR(-EINVAL);
+	local_essent_len = bpf_core_essential_name_len(name);
+
+	/* fetch vmlinux BTF up front: the module search uses it for
+	 * btf_nr_types() even when the vmlinux cache lookup jumps
+	 * straight to check_modules
+	 */
+	main_btf = bpf_get_btf_vmlinux();
+
+	cands = kcalloc(1, sizeof(*cands), GFP_KERNEL);
+	if (!cands)
+		return ERR_PTR(-ENOMEM);
+	cands->name = kmemdup_nul(name, local_essent_len, GFP_KERNEL);
+	if (!cands->name) {
+		kfree(cands);
+		return ERR_PTR(-ENOMEM);
+	}
+	cands->kind = btf_kind(local_type);
+	cands->name_len = local_essent_len;
+
+	cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+	if (cc) {
+		if (cc->cnt) {
+			bpf_free_cands(cands);
+			return cc;
+		}
+		goto check_modules;
+	}
+
+	/* Attempt to find target candidates in vmlinux BTF first */
+	cands = bpf_core_add_cands(cands, main_btf, 1);
+	if (IS_ERR(cands))
+		return cands;
+
+	/* populate cache even when cands->cnt == 0 */
+	populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+
+	/* if vmlinux BTF has any candidate, don't go for module BTFs */
+	if (cands->cnt)
+		return cands;
+
+check_modules:
+	cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+	if (cc) {
+		bpf_free_cands(cands);
+		/* if cache has it return it even if cc->cnt == 0 */
+		return cc;
+	}
+
+	/* If candidate is not found in vmlinux's BTF then search in module's BTFs */
+	spin_lock_bh(&btf_idr_lock);
+	idr_for_each_entry(&btf_idr, mod_btf, id) {
+		if (!btf_is_module(mod_btf))
+			continue;
+		/* linear search could be slow hence unlock/lock
+		 * the IDR to avoid holding it for too long
+		 */
+		btf_get(mod_btf);
+		spin_unlock_bh(&btf_idr_lock);
+		cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+		if (IS_ERR(cands)) {
+			btf_put(mod_btf);
+			return cands;
+		}
+		spin_lock_bh(&btf_idr_lock);
+		btf_put(mod_btf);
+	}
+	spin_unlock_bh(&btf_idr_lock);
+	/* populate cache even when cands->cnt == 0 */
+	populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+	return cands;
+}
+
 int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
 		   int relo_idx, void *insn)
 {
-	return -EOPNOTSUPP;
+	struct bpf_core_cand_list cands = {};
+	int err;
+
+	if (relo->kind != BPF_CORE_TYPE_ID_LOCAL) {
+		struct bpf_cand_cache *cc;
+		int i;
+
+		mutex_lock(&cand_cache_mutex);
+		cc = bpf_core_find_cands(ctx, relo->type_id);
+		if (IS_ERR(cc)) {
+			bpf_log(ctx->log, "target candidate search failed for %d\n",
+				relo->type_id);
+			mutex_unlock(&cand_cache_mutex);
+			return PTR_ERR(cc);
+		}
+		if (cc->cnt) {
+			cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
+			if (!cands.cands) {
+				mutex_unlock(&cand_cache_mutex);
+				return -ENOMEM;
+			}
+		}
+		for (i = 0; i < cc->cnt; i++) {
+			bpf_log(ctx->log,
+				"CO-RE relocating %s %s: found target candidate [%d]\n",
+				btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
+			cands.cands[i].btf = cc->cands[i].btf;
+			cands.cands[i].id = cc->cands[i].id;
+		}
+		cands.len = cc->cnt;
+		mutex_unlock(&cand_cache_mutex);
+	}
+
+	err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
+				       relo, relo_idx, ctx->btf, &cands);
+	kfree(cands.cands);
+	return err;
 }
diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h
index f410691cc4e5..f7b0d698978c 100644
--- a/tools/lib/bpf/relo_core.h
+++ b/tools/lib/bpf/relo_core.h
@@ -8,8 +8,10 @@
 struct bpf_core_cand {
 	const struct btf *btf;
+#ifndef __KERNEL__
 	const struct btf_type *t;
 	const char *name;
+#endif
 	__u32 id;
 };
 
-- 
2.30.2