Change multithreaded implementation of BTF encoding: * Use a single btf_encoder accumulating BTF for all compilation units * Make BTF encoding routine exclusive: only one thread at a time may execute btf_encoder__encode_cu * Introduce CU ids: an id is an index of a CU, in order they are created in dwarf_loader.c * Introduce CU__PROCESSED cu_state to inidicate what CUs have been processed by the encoder * Enforce encoding order of compilation units (struct cu) loaded from DWARF by utilizing global struct cus as a queue * reproducible_build option is now moot: BTF encoding is always reproducible with this change * Most of the code that merged the results of multiple BTF encoders into one BTF after CU processing is removed Motivation behind this change and analysis that led to it are in the cover letter to the patch series. In short, this implementation of BTF encoding makes it reproducible without sacrificing the performance gains from parallel processing. The speed in terms of wall-clock time is comparable to non-reproducible runs on pahole/next [1]. The memory footprint is lower with increased number of threads. pahole/next (12ca112): Performance counter stats for '/home/theihor/dev/dwarves/build/pahole -J -j24 --btf_features=encode_force,var,float,enum64,decl_tag,type_tag,optimized_func,consistent_func,decl_tag_kfuncs --btf_encode_detached=/dev/null --lang_exclude=rust /home/theihor/git/kernel.org/bpf-next/kbuild-output/.tmp_vmlinux1' (13 runs): 50,493,244,369 cycles ( +- 0.26% ) 1.6863 +- 0.0150 seconds time elapsed ( +- 0.89% ) jobs 1, mem 546556 Kb, time 4.53 sec jobs 2, mem 599776 Kb, time 2.81 sec jobs 4, mem 661756 Kb, time 2.05 sec jobs 8, mem 764584 Kb, time 1.58 sec jobs 16, mem 844856 Kb, time 1.59 sec jobs 32, mem 1047880 Kb, time 1.69 sec This patchset on top of pahole/next: Performance counter stats for '/home/theihor/dev/dwarves/build/pahole -J -j24 --btf_features=encode_force,var,float,enum64,decl_tag,type_tag,optimized_func,consistent_func,decl_tag_kfuncs --btf_encode_detached=/dev/null --lang_exclude=rust /home/theihor/git/kernel.org/bpf-next/kbuild-output/.tmp_vmlinux1' (13 runs): 31,175,635,417 cycles ( +- 0.22% ) 1.58644 +- 0.00501 seconds time elapsed ( +- 0.32% ) jobs 1, mem 544780 Kb, time 4.47 sec jobs 2, mem 553944 Kb, time 4.68 sec jobs 4, mem 563352 Kb, time 2.36 sec jobs 8, mem 585508 Kb, time 1.73 sec jobs 16, mem 635212 Kb, time 1.61 sec jobs 32, mem 772752 Kb, time 1.59 sec [1]: https://git.kernel.org/pub/scm/devel/pahole/pahole.git/commit/?h=next&id=12ca11281912c272f931e836b9160ee827250716 Signed-off-by: Ihor Solodrai <ihor.solodrai@xxxxx> --- dwarf_loader.c | 4 + dwarves.c | 47 ++++----- dwarves.h | 5 +- pahole.c | 255 +++++++++++++++++++------------------------------ 4 files changed, 127 insertions(+), 184 deletions(-) diff --git a/dwarf_loader.c b/dwarf_loader.c index 5d55649..86f8b92 100644 --- a/dwarf_loader.c +++ b/dwarf_loader.c @@ -3440,6 +3440,7 @@ struct dwarf_cus { int build_id_len; int error; struct dwarf_cu *type_dcu; + uint32_t nr_cus_created; }; struct dwarf_thread { @@ -3472,6 +3473,9 @@ static struct dwarf_cu *dwarf_cus__create_cu(struct dwarf_cus *dcus, Dwarf_Die * cu->priv = dcu; cu->dfops = &dwarf__ops; + cu->id = dcus->nr_cus_created; + dcus->nr_cus_created++; + return dcu; } diff --git a/dwarves.c b/dwarves.c index ae512b9..e8701ba 100644 --- a/dwarves.c +++ b/dwarves.c @@ -537,39 +537,18 @@ void cus__set_cu_state(struct cus *cus, struct cu *cu, enum cu_state state) cus__unlock(cus); } -// Used only when reproducible builds are desired -struct cu *cus__get_next_processable_cu(struct cus *cus) +struct cu *cus__get_cu_by_id(struct cus *cus, uint32_t id) { - struct cu *cu; - + struct cu *cu, *result = NULL; cus__lock(cus); - list_for_each_entry(cu, &cus->cus, node) { - switch (cu->state) { - case CU__LOADED: - cu->state = CU__PROCESSING; - goto found; - case CU__PROCESSING: - // This will happen when we get to parallel - // reproducible BTF encoding, libbpf dedup work needed - // here. The other possibility is when we're flushing - // the DWARF processed CUs when the parallel DWARF - // loading stoped and we still have CUs to encode to - // BTF because of ordering requirements. - continue; - case CU__UNPROCESSED: - // The first entry isn't loaded, signal the - // caller to return and try another day, as we - // need to respect the original DWARF CU ordering. - goto out; + if (cu->id == id) { + result = cu; + break; } } -out: - cu = NULL; -found: cus__unlock(cus); - - return cu; + return result; } bool cus__empty(const struct cus *cus) @@ -610,6 +589,20 @@ void cus__add(struct cus *cus, struct cu *cu) cu__find_class_holes(cu); } +void cus__remove_processed_cus(struct cus *cus) +{ + struct cu *cu, *tmp; + + cus__lock(cus); + list_for_each_entry_safe(cu, tmp, &cus->cus, node) { + if (cu->state == CU__PROCESSED) { + __cus__remove(cus, cu); + cu__delete(cu); + } + } + cus__unlock(cus); +} + static void ptr_table__init(struct ptr_table *pt) { pt->entries = NULL; diff --git a/dwarves.h b/dwarves.h index 1a0bd4b..8af4045 100644 --- a/dwarves.h +++ b/dwarves.h @@ -49,6 +49,7 @@ enum cu_state { CU__UNPROCESSED, CU__LOADED, CU__PROCESSING, + CU__PROCESSED, }; /* @@ -194,8 +195,9 @@ void cus__add(struct cus *cus, struct cu *cu); void __cus__remove(struct cus *cus, struct cu *cu); void cus__remove(struct cus *cus, struct cu *cu); +void cus__remove_processed_cus(struct cus *cus); -struct cu *cus__get_next_processable_cu(struct cus *cus); +struct cu *cus__get_cu_by_id(struct cus *cus, uint32_t id); void cus__set_cu_state(struct cus *cus, struct cu *cu, enum cu_state state); @@ -297,6 +299,7 @@ struct cu { struct ptr_table functions_table; struct ptr_table tags_table; struct rb_root functions; + uint32_t id; const char *name; char *filename; void *priv; diff --git a/pahole.c b/pahole.c index 6b46399..f56c66f 100644 --- a/pahole.c +++ b/pahole.c @@ -94,6 +94,7 @@ static struct conf_fprintf conf = { static struct conf_load conf_load = { .conf_fprintf = &conf, + .nr_jobs = 1, }; struct structure { @@ -3136,12 +3137,10 @@ static bool print_enumeration_with_enumerator(struct cu *cu, const char *name) return false; } -struct thread_data { - struct btf *btf; - struct btf_encoder *encoder; -}; +// not used yet +struct thread_data { int dummy; }; -static int pahole_threads_prepare_reproducible_build(struct conf_load *conf, int nr_threads, void **thr_data) +static int pahole_threads_prepare(struct conf_load *conf, int nr_threads, void **thr_data) { for (int i = 0; i < nr_threads; i++) thr_data[i] = NULL; @@ -3149,17 +3148,6 @@ static int pahole_threads_prepare_reproducible_build(struct conf_load *conf, int return 0; } -static int pahole_threads_prepare(struct conf_load *conf, int nr_threads, void **thr_data) -{ - int i; - struct thread_data *threads = calloc(sizeof(struct thread_data), nr_threads); - - for (i = 0; i < nr_threads; i++) - thr_data[i] = threads + i; - - return 0; -} - static int pahole_thread_exit(struct conf_load *conf, void *thr_data) { struct thread_data *thread = thr_data; @@ -3179,7 +3167,6 @@ static int pahole_threads_collect(struct conf_load *conf, int nr_threads, void * int error) { struct thread_data **threads = (struct thread_data **)thr_data; - int i; int err = 0; if (error) @@ -3189,29 +3176,97 @@ static int pahole_threads_collect(struct conf_load *conf, int nr_threads, void * if (err < 0) goto out; - for (i = 0; i < nr_threads; i++) { - /* - * Merge content of the btf instances of worker threads to the btf - * instance of the primary btf_encoder. - */ - if (!threads[i]->encoder || !threads[i]->btf) - continue; - err = btf_encoder__add_encoder(btf_encoder, threads[i]->encoder); - if (err < 0) - goto out; +out: + free(threads[0]); + + return err; +} + +static inline void btf_encoder__init(struct cu *cu, struct conf_load *conf_load) +{ + btf_encoder = btf_encoder__new(cu, + detached_btf_filename, + conf_load->base_btf, + global_verbose, + conf_load); + if (!btf_encoder) { + fprintf(stderr, "Error creating BTF encoder.\n"); + exit(1); } - err = 0; +} -out: - for (i = 0; i < nr_threads; i++) { - if (threads[i]->encoder && threads[i]->encoder != btf_encoder) { - btf_encoder__delete(threads[i]->encoder); - threads[i]->encoder = NULL; +static enum load_steal_kind pahole_stealer__threaded_btf_encode(struct cu *cu, struct conf_load *conf_load) +{ + static pthread_mutex_t btf_encode_lock = PTHREAD_MUTEX_INITIALIZER; + static uint32_t next_cu_id_to_encode; + + struct cu *cu_to_encode = NULL; + int encoding_ret; + + if (pthread_mutex_trylock(&btf_encode_lock) != 0) { + /* Another thread is busy encoding. + * Check the number of CUs in queue, and if it's too high, wait. + */ + while (cus__nr_entries(cus) >= conf_load->nr_jobs - 1) { + usleep(1000); + cus__remove_processed_cus(cus); } + goto out; } - free(threads[0]); - return err; + /* Got the lock: check whether btf_encoder has already been created */ + if (!btf_encoder) { + if (cu->id == 0) + btf_encoder__init(cu, conf_load); + goto out_unlock; + } + + /* Proceed to encoding in order of cu->id + * cus->cus serves as a queue of loaded cu-s + * Keep encoding until a cu with expected id is not yet loaded + */ + do { + cu_to_encode = cus__get_cu_by_id(cus, next_cu_id_to_encode); + if (!cu_to_encode || cu_to_encode->state != CU__LOADED) + break; + + encoding_ret = btf_encoder__encode_cu(btf_encoder, + cu_to_encode, + conf_load); + if (encoding_ret < 0) { + fprintf(stderr, "Encountered error while encoding BTF.\n"); + exit(1); + } + + cus__set_cu_state(cus, cu_to_encode, CU__PROCESSED); + next_cu_id_to_encode++; + } while (true); + +out_unlock: + pthread_mutex_unlock(&btf_encode_lock); +out: + // always keep incoming CUs + // they will be deleted by cus__remove_processed_cus() + return LSK__KEEPIT; +} + +static enum load_steal_kind pahole_stealer__btf_encode(struct cu *cu, struct conf_load *conf_load) +{ + int ret; + + if (!btf_encoder) + btf_encoder__init(cu, conf_load); + + ret = btf_encoder__encode_cu(btf_encoder, cu, conf_load); + if (ret < 0) { + fprintf(stderr, "Encountered error while encoding BTF.\n"); + exit(1); + } + + // This has no meaning for a single thread, but let's be consistent + cus__set_cu_state(cus, cu, CU__PROCESSED); + + return ret; } static enum load_steal_kind pahole_stealer(struct cu *cu, @@ -3238,94 +3293,10 @@ static enum load_steal_kind pahole_stealer(struct cu *cu, return LSK__DELETE; // Maybe we can find this in several CUs, so don't stop it if (btf_encode) { - static pthread_mutex_t btf_lock = PTHREAD_MUTEX_INITIALIZER; - struct btf_encoder *encoder; - - pthread_mutex_lock(&btf_lock); - /* - * FIXME: - * - * This should be really done at main(), but since in the current codebase only at this - * point we'll have cu->elf setup... - */ - if (!btf_encoder) { - /* - * btf_encoder is the primary encoder. - * And, it is used by the thread - * create it. - */ - btf_encoder = btf_encoder__new(cu, detached_btf_filename, conf_load->base_btf, - global_verbose, conf_load); - if (btf_encoder && thr_data) { - struct thread_data *thread = thr_data; - - thread->encoder = btf_encoder; - thread->btf = btf_encoder__btf(btf_encoder); - } - } - - // Reproducible builds don't have multiple btf_encoders, so we need to keep the lock until we encode BTF for this CU. - if (thr_data) - pthread_mutex_unlock(&btf_lock); - - if (!btf_encoder) { - ret = LSK__STOP_LOADING; - goto out_btf; - } - - /* - * thr_data keeps per-thread data for worker threads. Each worker thread - * has an encoder. The main thread will merge the data collected by all - * these encoders to btf_encoder. However, the first thread reaching this - * function creates btf_encoder and reuses it as its local encoder. It - * avoids copying the data collected by the first thread. - */ - if (thr_data) { - struct thread_data *thread = thr_data; - - if (thread->encoder == NULL) { - thread->encoder = - btf_encoder__new(cu, detached_btf_filename, - NULL, - global_verbose, - conf_load); - thread->btf = btf_encoder__btf(thread->encoder); - } - encoder = thread->encoder; - } else { - encoder = btf_encoder; - } - - // Since we don't have yet a way to parallelize the BTF encoding, we - // need to ask the loader for the next CU that we can process, one - // that is loaded and is in order, if the next one isn't yet loaded, - // then return to let the DWARF loader thread to load the next one, - // eventually all will get processed, even if when all DWARF loading - // threads finish. - if (conf_load->reproducible_build) { - ret = LSK__KEEPIT; // we're not processing the cu passed to this - // function, so keep it. - cu = cus__get_next_processable_cu(cus); - if (cu == NULL) - goto out_btf; - } - - ret = btf_encoder__encode_cu(encoder, cu, conf_load); - if (ret < 0) { - fprintf(stderr, "Encountered error while encoding BTF.\n"); - exit(1); - } - - if (conf_load->reproducible_build) { - ret = LSK__KEEPIT; // we're not processing the cu passed to this function, so keep it. - // Kinda equivalent to LSK__DELETE since we processed this, but we can't delete it - // as we stash references to entries in CUs for 'struct function' in btf_encoder__add_saved_funcs() - // and btf_encoder__save_func(), so we can't delete them here. - Alan Maguire - } -out_btf: - if (!thr_data) // See comment about reproducibe_build above - pthread_mutex_unlock(&btf_lock); - return ret; + if (conf_load->nr_jobs > 1) + return pahole_stealer__threaded_btf_encode(cu, conf_load); + else + return pahole_stealer__btf_encode(cu, conf_load); } #if 0 if (ctf_encode) { @@ -3625,24 +3596,6 @@ out_free: return ret; } -static int cus__flush_reproducible_build(struct cus *cus, struct btf_encoder *encoder, struct conf_load *conf_load) -{ - int err = 0; - - while (true) { - struct cu *cu = cus__get_next_processable_cu(cus); - - if (cu == NULL) - break; - - err = btf_encoder__encode_cu(encoder, cu, conf_load); - if (err < 0) - break; - } - - return err; -} - int main(int argc, char *argv[]) { int err, remaining, rc = EXIT_FAILURE; @@ -3731,18 +3684,12 @@ int main(int argc, char *argv[]) if (languages.exclude) conf_load.early_cu_filter = cu__filter; - conf_load.thread_exit = pahole_thread_exit; - - if (conf_load.reproducible_build) { - conf_load.threads_prepare = pahole_threads_prepare_reproducible_build; - conf_load.threads_collect = NULL; - } else { + if (btf_encode) { + conf_load.pre_load_module = btf_encoder__pre_load_module; conf_load.threads_prepare = pahole_threads_prepare; conf_load.threads_collect = pahole_threads_collect; - } + conf_load.thread_exit = pahole_thread_exit; - if (btf_encode) { - conf_load.pre_load_module = btf_encoder__pre_load_module; err = btf_encoding_context__init(); if (err < 0) goto out; @@ -3847,13 +3794,9 @@ try_sole_arg_as_class_names: header = NULL; if (btf_encode && btf_encoder) { // maybe all CUs were filtered out and thus we don't have an encoder? - if (conf_load.reproducible_build && - cus__flush_reproducible_build(cus, btf_encoder, &conf_load) < 0) { - fprintf(stderr, "Encountered error while encoding BTF.\n"); - exit(1); - } - if (conf_load.nr_jobs <= 1 || conf_load.reproducible_build) + // pahole_threads_collect() is not called in single-threaded runs + if (conf_load.nr_jobs <= 1) btf_encoder__add_saved_funcs(btf_encoder); err = btf_encoder__encode(btf_encoder); -- 2.47.0