On Sun, Mar 28, 2021 at 1:14 PM Yonghong Song <yhs@xxxxxx> wrote: > > For vmlinux built with clang thin-lto or lto, there exist > cross cu type references. For example, the below can happen: > compile unit 1: > tag 10: type A > compile unit 2: > ... > refer to type A (tag 10 in compile unit 1) > I only checked a few but have seen type A may be a simple type > like "unsigned char" or a complex type like an array of base types. > > To resolve this issue, the tag DW_AT_producer of the first > DW_TAG_compile_unit is checked. If the binary is built > with clang lto, all debuginfo dwarf cu's will be merged > into one pahole cu which will resolve the above > cross-cu tag reference issue. To test whether a binary > is built with clang lto or not, The "clang version" > and "-flto" will be checked against DW_AT_producer string > for the first 5 debuginfo cu's. The reason is that > a few linux files disabled lto for various reasons. > > Merging cu's will create a single cu with lots of types, tags > and functions. For example with clang thin-lto built vmlinux, > I saw 9M entries in types table, 5.2M in tags table. The > below are pahole wallclock time for different hashbits: > command line: time pahole -J vmlinux > # of hashbits wallclock time in seconds > 15 460 > 16 255 > 17 131 > 18 97 > 19 75 > 20 69 > 21 64 > 22 62 > 23 58 > 24 64 > > The problem is with hashtags__find(), esp. the loop > uint32_t bucket = hashtags__fn(id); > const struct hlist_head *head = hashtable + bucket; > hlist_for_each_entry(tpos, pos, head, hash_node) { > if (tpos->id == id) > return tpos; > } > > Say we have 9M types and (1 << 15) buckets, that means each bucket > will have roughly 64 elements. So each lookup will traverse > the loop 32 iterations on average. > > If we have 1 << 21 buckets, then each buckets will have 4 elements, > and the average number of loop iterations for hashtags__find() > will be 2. > > Note that the number of hashbits 24 makes performance worse > than 23. The reason could be that 23 hashbits can cover 8M > buckets (close to 9M for the number of entries in types table). > Higher number of hash bits allocates more memory and becomes > less cache efficient compared to 23 hashbits. > > This patch picks # of hashbits 21 as the starting value > and will try to allocate memory based on that, if memory > allocation fails, we will go with less hashbits until > we reach hashbits 15 which is the default for > non merge-cu case. > > Signed-off-by: Yonghong Song <yhs@xxxxxx> > --- > dwarf_loader.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 120 insertions(+) > > diff --git a/dwarf_loader.c b/dwarf_loader.c > index aa6372a..a51391e 100644 > --- a/dwarf_loader.c > +++ b/dwarf_loader.c > @@ -51,6 +51,7 @@ struct strings *strings; > #endif > > static uint32_t hashtags__bits = 15; > +static uint32_t max_hashtags__bits = 21; > > static uint32_t hashtags__fn(Dwarf_Off key) > { > @@ -2484,6 +2485,115 @@ static int cus__load_debug_types(struct cus *cus, struct conf_load *conf, > return 0; > } > > +static bool cus__merging_cu(Dwarf *dw) > +{ > + uint8_t pointer_size, offset_size; > + Dwarf_Off off = 0, noff; > + size_t cuhl; > + int cnt = 0; > + > + /* > + * Just checking the first cu is not enough. > + * In linux, some C files may have LTO is disabled, e.g., > + * e242db40be27 x86, vdso: disable LTO only for vDSO > + * d2dcd3e37475 x86, cpu: disable LTO for cpu.c > + * Fortunately, disabling LTO for a particular file in a LTO build > + * is rather an exception. Iterating 5 cu's to check whether > + * LTO is used or not should be enough. > + */ > + while (dwarf_nextcu(dw, off, &noff, &cuhl, NULL, &pointer_size, > + &offset_size) == 0) { > + Dwarf_Die die_mem; > + Dwarf_Die *cu_die = dwarf_offdie(dw, off + cuhl, &die_mem); > + > + if (cu_die == NULL) > + break; > + > + if (++cnt > 5) > + break; > + > + const char *producer = attr_string(cu_die, DW_AT_producer); > + if (strstr(producer, "clang version") != NULL && > + strstr(producer, "-flto") != NULL) Instead of checking for flags, which can be a bit brittle, would it make more sense to scan the abbreviations to see if there are any "sec_offset" encodings used for type attributes to indicate that LTO was used? Thank you for improving on my hacky patch! :-) -bw > + return true; > + > + off = noff; > + } > + > + return false; > +} > + > +static int cus__merge_and_process_cu(struct cus *cus, struct conf_load *conf, > + Dwfl_Module *mod, Dwarf *dw, Elf *elf, > + const char *filename, > + const unsigned char *build_id, > + int build_id_len, > + struct dwarf_cu *type_dcu) > +{ > + uint8_t pointer_size, offset_size; > + struct dwarf_cu *dcu = NULL; > + Dwarf_Off off = 0, noff; > + struct cu *cu = NULL; > + size_t cuhl; > + > + while (dwarf_nextcu(dw, off, &noff, &cuhl, NULL, &pointer_size, > + &offset_size) == 0) { > + Dwarf_Die die_mem; > + Dwarf_Die *cu_die = dwarf_offdie(dw, off + cuhl, &die_mem); > + > + if (cu_die == NULL) > + break; > + > + if (cu == NULL) { > + cu = cu__new("", pointer_size, build_id, build_id_len, > + filename); > + if (cu == NULL || cu__set_common(cu, conf, mod, elf) != 0) > + return DWARF_CB_ABORT; > + > + dcu = malloc(sizeof(struct dwarf_cu)); > + if (dcu == NULL) > + return DWARF_CB_ABORT; > + > + /* Merged cu tends to need a lot more memory. > + * Let us start with max_hashtags__bits and > + * go down to find a proper hashtag bit value. > + */ > + uint32_t default_hbits = hashtags__bits; > + for (hashtags__bits = max_hashtags__bits; > + hashtags__bits >= default_hbits; > + hashtags__bits--) { > + if (dwarf_cu__init(dcu) == 0) > + break; > + } > + if (hashtags__bits < default_hbits) > + return DWARF_CB_ABORT; > + > + dcu->cu = cu; > + dcu->type_unit = type_dcu; > + cu->priv = dcu; > + cu->dfops = &dwarf__ops; > + cu->language = attr_numeric(cu_die, DW_AT_language); > + } > + > + Dwarf_Die child; > + if (dwarf_child(cu_die, &child) == 0) { > + if (die__process_unit(&child, cu) != 0) > + return DWARF_CB_ABORT; > + } > + > + off = noff; > + } > + > + /* process merged cu */ > + if (cu__recode_dwarf_types(cu) != LSK__KEEPIT) > + return DWARF_CB_ABORT; > + if (finalize_cu_immediately(cus, cu, dcu, conf) > + == LSK__STOP_LOADING) > + return DWARF_CB_ABORT; > + > + return 0; > +} > + > static int cus__load_module(struct cus *cus, struct conf_load *conf, > Dwfl_Module *mod, Dwarf *dw, Elf *elf, > const char *filename) > @@ -2518,6 +2628,15 @@ static int cus__load_module(struct cus *cus, struct conf_load *conf, > } > } > > + if (cus__merging_cu(dw)) { > + res = cus__merge_and_process_cu(cus, conf, mod, dw, elf, filename, > + build_id, build_id_len, > + type_cu ? &type_dcu : NULL); > + if (res) > + return res; > + goto out; > + } > + > while (dwarf_nextcu(dw, off, &noff, &cuhl, NULL, &pointer_size, > &offset_size) == 0) { > Dwarf_Die die_mem; > @@ -2557,6 +2676,7 @@ static int cus__load_module(struct cus *cus, struct conf_load *conf, > off = noff; > } > > +out: > if (type_lsk == LSK__DELETE) > cu__delete(type_cu); > > -- > 2.30.2 >