On Tue, 2024-05-28 at 13:24 +0100, Alan Maguire wrote: [...] > +/* Build a map from distilled base BTF ids to base BTF ids. To do so, iterate > + * through base BTF looking up distilled type (using binary search) equivalents. > + */ > +static int btf_relocate_map_distilled_base(struct btf_relocate *r) > +{ I have several observations about this algorithm. The algorithm does Base.Cnt * log2(Dist.Cnt) binary searches. However, it might be better to switch searches around and look for distilled {name,size} pairs in base btf, doing Dist.Cnt * log2(Base.cnt) searches instead. Suppose Base.Cnt = 2**20 and Dist.Cnt = 2**10, in such case: - Base.Cnt * log2(Dist.Cnt) = 2**20 * 10 - Dist.Cnt * log2(Base.cnt) = 2**10 * 20, which is smaller The algorithm might not handle name duplicates in the distilled BTF well, e.g. in theory, the following is a valid C code struct foo { int f; }; // sizeof(struct foo) == 4 typedef int foo; // sizeof(foo) == 4 Suppose that these types are a part of the distilled BTF. Depending on which one would end up first in 'dist_base_info_sorted' bsearch might fail to find one or the other. Also, algorithm does not report an error if there are several types with the same name and size in the base BTF. I suggest to modify the algorithm as follows: - let 'base_info_sorted' be a set of tuples {kind,name,size,id} corresponding to the base BTF, sorted by kind, name and size; - add a custom utility bsearch_unique, that behaves like bsearch, but returns NULL if entry is non-unique with regards to current predicate (e.g. use bsearch but also check neighbors); - for each type D in the distilled base: - use bsearch_unique to find entry E in 'base_info_sorted' that matches D.{kind,name,size} sub-tuple; - if E exists, set id_map[D] := E.id; - if E does not exist: - if id_map[D] == BTF_IS_EMBEDDED, report an error; - if id_map[D] != BTF_IS_EMBEDDED: - use bsearch_unique to find entry E in 'base_info_sorted' that matches D.{kind,name} sub-tuple; - if E exists, set id_map[D] := E.id; - otherwise, report an error. This allows to: - flip the search order, potentially gaining some speed; - drop the 'base_name_cnt' array and logic; - handle the above hypothetical name conflict example. Wdyt? > + struct btf_name_info *dist_base_info_sorted; > + struct btf_type *base_t, *dist_t, *split_t; > + __u8 *base_name_cnt = NULL; > + int err = 0; > + __u32 id; > + > + /* generate a sort index array of name/type ids sorted by name for > + * distilled base BTF to speed name-based lookups. > + */ > + dist_base_info_sorted = calloc(r->nr_dist_base_types, sizeof(*dist_base_info_sorted)); > + if (!dist_base_info_sorted) { > + err = -ENOMEM; > + goto done; > + } > + for (id = 0; id < r->nr_dist_base_types; id++) { > + dist_t = btf_type_by_id(r->dist_base_btf, id); > + dist_base_info_sorted[id].name = btf__name_by_offset(r->dist_base_btf, > + dist_t->name_off); > + dist_base_info_sorted[id].id = id; > + dist_base_info_sorted[id].size = dist_t->size; > + dist_base_info_sorted[id].needs_size = true; > + } > + qsort(dist_base_info_sorted, r->nr_dist_base_types, sizeof(*dist_base_info_sorted), > + cmp_btf_name_size); > + > + /* Mark distilled base struct/union members of split BTF structs/unions > + * in id_map with BTF_IS_EMBEDDED; this signals that these types > + * need to match both name and size, otherwise embeddding the base > + * struct/union in the split type is invalid. > + */ > + for (id = r->nr_dist_base_types; id < r->nr_split_types; id++) { > + split_t = btf_type_by_id(r->btf, id); > + if (btf_is_composite(split_t)) { > + err = btf_type_visit_type_ids(split_t, btf_mark_embedded_composite_type_ids, > + r); > + if (err < 0) > + goto done; > + } > + } > + > + /* Collect name counts for composite types in base BTF. If multiple > + * instances of a struct/union of the same name exist, we need to use > + * size to determine which to map to since name alone is ambiguous. > + */ > + base_name_cnt = calloc(r->base_str_len, sizeof(*base_name_cnt)); > + if (!base_name_cnt) { > + err = -ENOMEM; > + goto done; > + } > + for (id = 1; id < r->nr_base_types; id++) { > + base_t = btf_type_by_id(r->base_btf, id); > + if (!btf_is_composite(base_t) || !base_t->name_off) > + continue; > + if (base_name_cnt[base_t->name_off] < 255) > + base_name_cnt[base_t->name_off]++; > + } > + > + /* Now search base BTF for matching distilled base BTF types. */ > + for (id = 1; id < r->nr_base_types; id++) { > + struct btf_name_info *dist_name_info, base_name_info = {}; > + int dist_kind, base_kind; > + > + base_t = btf_type_by_id(r->base_btf, id); > + /* distilled base consists of named types only. */ > + if (!base_t->name_off) > + continue; > + base_kind = btf_kind(base_t); > + base_name_info.id = id; > + base_name_info.name = btf__name_by_offset(r->base_btf, base_t->name_off); > + switch (base_kind) { > + case BTF_KIND_INT: > + case BTF_KIND_FLOAT: > + case BTF_KIND_ENUM: > + case BTF_KIND_ENUM64: > + /* These types should match both name and size */ > + base_name_info.needs_size = true; > + base_name_info.size = base_t->size; > + break; > + case BTF_KIND_FWD: > + /* No size considerations for fwds. */ > + break; > + case BTF_KIND_STRUCT: > + case BTF_KIND_UNION: > + /* Size only needs to be used for struct/union if there > + * are multiple types in base BTF with the same name. > + * If there are multiple _distilled_ types with the same > + * name (a very unlikely scenario), that doesn't matter > + * unless corresponding _base_ types to match them are > + * missing. > + */ > + base_name_info.needs_size = base_name_cnt[base_t->name_off] > 1; > + base_name_info.size = base_t->size; > + break; > + default: > + continue; > + } > + dist_name_info = bsearch(&base_name_info, dist_base_info_sorted, > + r->nr_dist_base_types, sizeof(*dist_base_info_sorted), > + cmp_btf_name_size); > + if (!dist_name_info) > + continue; > + if (!dist_name_info->id || dist_name_info->id > r->nr_dist_base_types) { > + pr_warn("base BTF id [%d] maps to invalid distilled base BTF id [%d]\n", > + id, dist_name_info->id); > + err = -EINVAL; > + goto done; > + } > + dist_t = btf_type_by_id(r->dist_base_btf, dist_name_info->id); > + dist_kind = btf_kind(dist_t); > + > + /* Validate that the found distilled type is compatible. > + * Do not error out on mismatch as another match may occur > + * for an identically-named type. > + */ > + switch (dist_kind) { > + case BTF_KIND_FWD: > + switch (base_kind) { > + case BTF_KIND_FWD: > + if (btf_kflag(dist_t) != btf_kflag(base_t)) > + continue; > + break; > + case BTF_KIND_STRUCT: > + if (btf_kflag(base_t)) > + continue; > + break; > + case BTF_KIND_UNION: > + if (!btf_kflag(base_t)) > + continue; > + break; > + default: > + continue; > + } > + break; > + case BTF_KIND_INT: > + if (dist_kind != base_kind || > + btf_int_encoding(base_t) != btf_int_encoding(dist_t)) > + continue; > + break; > + case BTF_KIND_FLOAT: > + if (dist_kind != base_kind) > + continue; > + break; > + case BTF_KIND_ENUM: > + /* ENUM and ENUM64 are encoded as sized ENUM in > + * distilled base BTF. > + */ > + if (dist_kind != base_kind && base_kind != BTF_KIND_ENUM64) > + continue; > + break; > + case BTF_KIND_STRUCT: > + case BTF_KIND_UNION: > + /* size verification is required for embedded > + * struct/unions. > + */ > + if (r->id_map[dist_name_info->id] == BTF_IS_EMBEDDED && > + base_t->size != dist_t->size) > + continue; > + break; > + default: > + continue; > + } > + /* map id and name */ > + r->id_map[dist_name_info->id] = id; > + r->str_map[dist_t->name_off] = base_t->name_off; > + } > + /* ensure all distilled BTF ids now have a mapping... */ > + for (id = 1; id < r->nr_dist_base_types; id++) { > + const char *name; > + > + if (r->id_map[id] && r->id_map[id] != BTF_IS_EMBEDDED) > + continue; > + dist_t = btf_type_by_id(r->dist_base_btf, id); > + name = btf__name_by_offset(r->dist_base_btf, dist_t->name_off); > + pr_warn("distilled base BTF type '%s' [%d] is not mapped to base BTF id\n", > + name, id); > + err = -EINVAL; > + break; > + } > +done: > + free(base_name_cnt); > + free(dist_base_info_sorted); > + return err; > +} [...]