Re: [PATCH dwarves 2/2] pahole: Use per-thread btf instances to avoid mutex locking.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Jan 19, 2022 at 5:09 PM Kui-Feng Lee <kuifeng@xxxxxx> wrote:
>
> Create an instance of btf for each worker thread, and add type info to
> the local btf instance in the steal-function of pahole without mutex
> acquiring.  Once finished with all worker threads, merge all
> per-thread btf instances to the primary btf instance.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxx>
> ---

Just a bunch of nits, I'll relay to Arnaldo to verify the logic is sound

>  btf_encoder.c |   5 +++
>  btf_encoder.h |   2 +
>  pahole.c      | 117 +++++++++++++++++++++++++++++++++++++++++++++++---
>  3 files changed, 119 insertions(+), 5 deletions(-)
>
> diff --git a/btf_encoder.c b/btf_encoder.c
> index 9d015f304e92..96a6c1d5fba0 100644
> --- a/btf_encoder.c
> +++ b/btf_encoder.c
> @@ -1529,3 +1529,8 @@ int btf_encoder__encode_cu(struct btf_encoder *encoder, struct cu *cu)
>  out:
>         return err;
>  }
> +
> +struct btf *btf_encoder__get_btf(struct btf_encoder *encoder)

nit: btf_encoder__btf() is how we'd call this in libbpf, not sure
which naming convention Arnaldo prefers

> +{
> +       return encoder->btf;
> +}
> diff --git a/btf_encoder.h b/btf_encoder.h
> index f133b0d7674d..e141ec2e677d 100644
> --- a/btf_encoder.h
> +++ b/btf_encoder.h
> @@ -29,4 +29,6 @@ struct btf_encoder *btf_encoders__first(struct list_head *encoders);
>
>  struct btf_encoder *btf_encoders__next(struct btf_encoder *encoder);
>
> +struct btf *btf_encoder__get_btf(struct btf_encoder *encoder);
> +
>  #endif /* _BTF_ENCODER_H_ */
> diff --git a/pahole.c b/pahole.c
> index f3eeaaca4cdf..56bd7dc4e62e 100644
> --- a/pahole.c
> +++ b/pahole.c
> @@ -2798,6 +2798,73 @@ out:
>
>  static struct type_instance *header;
>
> +struct thread_data {
> +       struct btf *btf;
> +       struct btf_encoder *encoder;
> +};
> +
> +static int pahole_threads_prepare(struct conf_load *conf, int nr_threads, void **thr_data)
> +{
> +       int i;
> +       struct thread_data *threads =
> +               (struct thread_data *)calloc(sizeof(struct thread_data), nr_threads);

nit: split variable declaration and initialization, if it's not
fitting on a single line? You also don't need casting of calloc()
result in C.

> +
> +       for (i = 0; i < nr_threads; i++)
> +               thr_data[i] = threads + i;
> +
> +       return 0;
> +}
> +
> +static int pahole_thread_exit(struct conf_load *conf, void *thr_data)
> +{
> +       struct thread_data *thread = (struct thread_data *)thr_data;

same, casting not needed

> +
> +       if (thread == NULL)

nit: !thread

> +               return 0;
> +
> +       /*
> +        * It will call dedup here in the future once we have fixed
> +        * the known performance regression of libbpf.
> +        */

uhm... it's not a regression, it's BTF dedup algorithm not really
supporting partial dedup and subsequent dedup. I'd drop this comment
for now, it just adds confusion, IMO.

> +
> +       return 0;
> +}
> +
> +static int pahole_threads_collect(struct conf_load *conf, int nr_threads, void **thr_data,
> +                                 int error)
> +{
> +       struct thread_data **threads = (struct thread_data **)thr_data;
> +       int i;
> +       int err = 0;
> +
> +       if (error == 0) {

reduce nesting:

if (error)
    goto out;

> +               for (i = 0; i < nr_threads; i++) {
> +                       /*
> +                        * Merge cotnent of the btf instances of

typo: content

> +                        * worker threads to the btf instance of the
> +                        * primary btf_encoder.
> +                        */
> +                       if (!threads[i]->btf || threads[i]->encoder == btf_encoder)
> +                               continue; /* The primary btf_encoder */
> +                       err = btf__add_btf(btf_encoder__get_btf(btf_encoder), threads[i]->btf);

if you do btf__add_btf() in pahole_thread_exit() (under mutex, but
that's fine), you can offload some time spent in btf__Add_btf() to
worker threads. Have you tried that? I bet you'll get a bit of speed
up that way as well.

> +                       if (err < 0)
> +                               goto out;
> +                       err = 0;

move this to after the for loop

> +                       btf_encoder__delete(threads[i]->encoder);
> +                       threads[i]->encoder = NULL;
> +               }
> +       }
> +
> +out:
> +       for (i = 0; i < nr_threads; i++) {
> +               if (threads[i]->encoder && threads[i]->encoder != btf_encoder)
> +                       btf_encoder__delete(threads[i]->encoder);
> +       }
> +       free(threads[0]);
> +
> +       return err;
> +}
> +
>  static enum load_steal_kind pahole_stealer(struct cu *cu,
>                                            struct conf_load *conf_load,
>                                            void *thr_data)
> @@ -2819,8 +2886,8 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
>
>         if (btf_encode) {
>                 static pthread_mutex_t btf_lock = PTHREAD_MUTEX_INITIALIZER;
> +               struct btf_encoder *encoder;
>
> -               pthread_mutex_lock(&btf_lock);
>                 /*
>                  * FIXME:
>                  *
> @@ -2828,21 +2895,58 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
>                  * point we'll have cu->elf setup...
>                  */
>                 if (!btf_encoder) {
> -                       btf_encoder = btf_encoder__new(cu, detached_btf_filename, conf_load->base_btf, skip_encoding_btf_vars,
> -                                                      btf_encode_force, btf_gen_floats, global_verbose);
> +                       int new_btf_encoder = 0;

this is boolean variable, use bool

> +
> +                       pthread_mutex_lock(&btf_lock);
> +                       if (!btf_encoder) {
> +                               /*
> +                                * btf_encoder is the primary encoder.
> +                                * And, it is used by the thread
> +                                * create it.
> +                                */
> +                               btf_encoder = btf_encoder__new(cu, detached_btf_filename,
> +                                                              conf_load->base_btf,
> +                                                              skip_encoding_btf_vars,
> +                                                              btf_encode_force,
> +                                                              btf_gen_floats, global_verbose);
> +                               new_btf_encoder = 1;
> +                       }
> +                       pthread_mutex_unlock(&btf_lock);
>                         if (btf_encoder == NULL) {
>                                 ret = LSK__STOP_LOADING;
>                                 goto out_btf;
>                         }
> +                       if (new_btf_encoder && thr_data) {
> +                               struct thread_data *thread = (struct thread_data *)thr_data;
> +
> +                               thread->encoder = btf_encoder;
> +                               thread->btf = btf_encoder__get_btf(btf_encoder);

maybe do this initialization right after you created btf_encode above
(under lock is fine, there is nothing slow here)

> +                       }
>                 }
>
> -               if (btf_encoder__encode_cu(btf_encoder, cu)) {
> +               if (thr_data) {
> +                       struct thread_data *thread = (struct thread_data *)thr_data;
> +
> +                       if (thread->encoder == NULL) {
> +                               thread->encoder =
> +                                       btf_encoder__new(cu, detached_btf_filename,
> +                                                        NULL,
> +                                                        skip_encoding_btf_vars,
> +                                                        btf_encode_force,
> +                                                        btf_gen_floats,
> +                                                        global_verbose);
> +                               thread->btf = btf_encoder__get_btf(thread->encoder);
> +                       }
> +                       encoder = thread->encoder;
> +               } else
> +                       encoder = btf_encoder;
> +
> +               if (btf_encoder__encode_cu(encoder, cu)) {
>                         fprintf(stderr, "Encountered error while encoding BTF.\n");
>                         exit(1);
>                 }
>                 ret = LSK__DELETE;
>  out_btf:
> -               pthread_mutex_unlock(&btf_lock);
>                 return ret;
>         }
>  #if 0
> @@ -3207,6 +3311,9 @@ int main(int argc, char *argv[])
>         memset(tab, ' ', sizeof(tab) - 1);
>
>         conf_load.steal = pahole_stealer;
> +       conf_load.thread_exit = pahole_thread_exit;
> +       conf_load.threads_prepare = pahole_threads_prepare;
> +       conf_load.threads_collect = pahole_threads_collect;
>
>         // Make 'pahole --header type < file' a shorter form of 'pahole -C type --count 1 < file'
>         if (conf.header_type && !class_name && prettify_input) {
> --
> 2.30.2
>



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux